In [17]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Annotated, Any, Literal, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd

# import polars as pl

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings: print floating-point values with 4 digits of precision
np.set_printoptions(precision=4)

# Pandas settings: raise the display limits (rows, columns, column width)
# so wide/long frames are not truncated as aggressively when rendered
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings (disabled, like the commented-out polars import)
# pl.Config.set_fmt_str_lengths(1_000)
# pl.Config.set_tbl_cols(n=1_000)
# pl.Config.set_tbl_rows(n=200)

# No category or module filter is given, so every warning is suppressed
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports: mode 2 reloads imported modules before running a cell
%load_ext autoreload
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import time

import ray

# ray.init()


# View resource usage
# print(ray.available_resources())

# Toy in-memory "database": retrieve() indexes into this list directly, and
# it is also the object handed to ray.put() for the parallel examples.
database: list[str] = [
    "Learning",
    "Ray",
    "Flexible",
    "Distributed",
    "Python",
    "for",
    "Machine",
    "Learning",
]


def retrieve(item: int) -> tuple[int, str]:
    """Look up one entry of the module-level ``database`` after a delay.

    Parameters
    ----------
    item : int
        Index into ``database``.

    Returns
    -------
    tuple[int, str]
        The index together with the string stored at that index.
    """
    # Simulated processing cost: sleeps item / 10 seconds before the lookup.
    delay = item / 10
    time.sleep(delay)
    value = database[item]
    return item, value


def print_runtime(input_data: list[tuple[int, str]], start_time: float) -> None:
    """Print the elapsed time since ``start_time`` and then the data.

    Parameters
    ----------
    input_data : list[tuple[int, str]]
        Items to print, one per line.
    start_time : float
        Reference timestamp as returned by ``time.time()``.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    elapsed = time.time() - start_time
    print(f"Runtime: {elapsed:.2f} seconds, data:")
    # One item per line; an empty list still emits a single blank line.
    print("\n".join(map(str, input_data)))


@ray.remote
def retrieve_task(item: int, db: list[str]) -> tuple[int, str]:
    """Ray task that looks up ``db[item]`` after a simulated delay.

    Parameters
    ----------
    item : int
        Index into ``db``.
    db : list[str]
        The sequence to read from, passed in explicitly instead of being
        read from a module-level global.

    Returns
    -------
    tuple[int, str]
        The index together with the string stored at that index.

    Notes
    -----
    An identical definition appears again in the "Parallel Computing Using
    Ray" section of this notebook; whichever cell ran last wins.
    """
    # Simulated processing cost: item / 10.0 seconds.
    delay = item / 10.0
    time.sleep(delay)
    value = db[item]
    return item, value


def synchronous_call() -> None:
    """Retrieve items 0 through 7 one after another and print the runtime.

    Returns
    -------
    None
        The runtime and the retrieved data are printed via ``print_runtime``.
    """
    started_at: float = time.time()
    results: list[tuple[int, str]] = []
    # Sequential baseline: each retrieve() call finishes before the next starts.
    for index in range(8):
        results.append(retrieve(index))
    print_runtime(results, started_at)


# Run the sequential baseline when this cell executes
# (inside a notebook kernel ``__name__`` is "__main__").
if __name__ == "__main__":
    synchronous_call()

Runtime: 2.82 seconds, data:
(0, 'Learning')
(1, 'Ray')
(2, 'Flexible')
(3, 'Distributed')
(4, 'Python')
(5, 'for')
(6, 'Machine')
(7, 'Learning')


## Parallel Computing Using Ray

In [6]:
# Put the db in Ray's object store so that it can be
# accessed by all workers in the cluster.
# The returned reference, not the list itself, is what the cells below
# pass to ``retrieve_task.remote``.
db_object_ref = ray.put(database)


@ray.remote
def retrieve_task(item: int, db: list[str]) -> tuple[int, str]:
    """Ray task that looks up ``db[item]`` after a simulated delay.

    Parameters
    ----------
    item : int
        Index into ``db``.
    db : list[str]
        The sequence to read from. Callers in this notebook pass
        ``db_object_ref``; Ray resolves an object reference given as a
        top-level argument to its value before the task body runs.

    Returns
    -------
    tuple[int, str]
        The index together with the string stored at that index.

    Notes
    -----
    This repeats the definition from the earlier synchronous-example cell.
    """
    # Simulated processing cost: item / 10.0 seconds.
    delay = item / 10.0
    time.sleep(delay)
    value = db[item]
    return item, value


def parallel_call() -> None:
    """Retrieve items 0 through 7 as concurrent Ray tasks and print the runtime.

    Returns
    -------
    None
        The runtime and the retrieved data are printed via ``print_runtime``.

    Notes
    -----
    One ``retrieve_task`` is submitted per index in ``range(8)``; each is
    given ``db_object_ref`` rather than the raw list. A single ``ray.get``
    on the list of references then blocks until all results are available.
    """
    started_at: float = time.time()
    pending: list[ray.ObjectRef] = []
    for index in range(8):
        # .remote() hands back an object reference (a future), not a value.
        pending.append(retrieve_task.remote(index, db_object_ref))
    # Blocking fetch; the values come back in the same order as ``pending``.
    results: list[tuple[int, str]] = ray.get(pending)
    print_runtime(results, started_at)


# Run the parallel version when this cell executes
# (inside a notebook kernel ``__name__`` is "__main__").
if __name__ == "__main__":
    parallel_call()

Runtime: 0.72 seconds, data:
(0, 'Learning')
(1, 'Ray')
(2, 'Flexible')
(3, 'Distributed')
(4, 'Python')
(5, 'for')
(6, 'Machine')
(7, 'Learning')


In [16]:
def parallel_call_updated_1() -> list[tuple[int, str]]:
    """Run the retrievals in parallel and consume results in small batches.

    Instead of one blocking ``ray.get`` over every task, ``ray.wait`` is
    called repeatedly so each batch is printed as soon as it is ready.

    Returns
    -------
    list[tuple[int, str]]
        Every retrieved ``(index, value)`` pair, appended batch by batch in
        the order the batches were returned by ``ray.wait``.

    Notes
    -----
    object_references : list[ray.ObjectRef]
        References for the tasks that have not been collected yet; starts
        with one reference per index in ``range(8)`` and shrinks each loop.
    data : list[tuple[int, str]]
        Results of the batch that just finished (at most ``batch_size``).
    """
    start: float = time.time()
    # It returns a list of object references (i.e. a future)
    object_references: list[ray.ObjectRef] = [
        retrieve_task.remote(item, db_object_ref) for item in range(8)
    ]
    all_data: list[tuple[int, str]] = []
    timeout: float = 7.0
    batch_size: int = 2  # Collect up to 2 results per ray.wait call.

    # Instead of blocking, iterate thru the object references with a
    # max timeout of N seconds.
    while len(object_references) > 0:
        finished, object_references = ray.wait(
            object_references,
            # ray.wait rejects num_returns larger than the number of refs it
            # is given. That can happen once a timed-out call returned fewer
            # than batch_size refs and an odd number is left, so clamp it.
            num_returns=min(batch_size, len(object_references)),
            timeout=timeout,
        )
        # ``finished`` may be shorter than requested (even empty) on timeout.
        data: list[tuple[int, str]] = ray.get(finished)
        print_runtime(data, start)
        all_data.extend(data)
    return all_data


# Run the batched version when this cell executes and keep the combined
# results in the notebook namespace for inspection in later cells.
if __name__ == "__main__":
    all_data: list[tuple[int, str]] = parallel_call_updated_1()

Runtime: 0.14 seconds, data:
(0, 'Learning')
(1, 'Ray')
Runtime: 0.34 seconds, data:
(2, 'Flexible')
(3, 'Distributed')
Runtime: 0.54 seconds, data:
(4, 'Python')
(5, 'for')
Runtime: 0.75 seconds, data:
(6, 'Machine')
(7, 'Learning')


In [13]:
# Bare last expression: displays the current value bound to ``all_data``.
all_data

[(0, 'Learning'),
 (1, 'Ray'),
 (2, 'Flexible'),
 (3, 'Distributed'),
 (4, 'Python'),
 (5, 'for'),
 (6, 'Machine'),
 (7, 'Learning')]

### Handling Task Dependencies

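- Chain two tasks: pass the object reference returned by `retrieve_task` directly to `follow_up_task`, so the follow-up task runs on the first task's result without an intermediate `ray.get`.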

In [None]:
@ray.remote
def follow_up_task(
    retrieve_result: tuple[int, str],
) -> list[tuple[int, str]]:
    """Fetch the item that follows an already-retrieved one and return both.

    Parameters
    ----------
    retrieve_result : tuple[int, str]
        ``(index, value)`` pair as produced by ``retrieve_task``.

    Returns
    -------
    list[tuple[int, str]]
        Two-element list ``[retrieve_result, follow_up_result]`` where
        ``follow_up_result`` is ``retrieve(index + 1)``.

    Notes
    -----
    The follow-up lookup goes through ``retrieve``, so it sleeps
    ``(index + 1) / 10`` seconds and reads the module-level ``database``
    rather than a ``db`` argument. ``retrieve`` raises ``IndexError`` when
    ``index + 1`` is past the end of ``database`` (e.g. for the last index).
    """
    # Only the index is needed to locate the next item.
    original_item, _ = retrieve_result
    follow_up_result: tuple[int, str] = retrieve(item=original_item + 1)
    return [retrieve_result, follow_up_result]


def parallel_call_2() -> list[list[tuple[int, str]]]:
    """Chain ``retrieve_task`` into ``follow_up_task`` and block for all results.

    Returns
    -------
    list[list[tuple[int, str]]]
        One ``[retrieve_result, follow_up_result]`` list per starting index
        in ``[0, 2, 4, 6]``, in that order.

    Notes
    -----
    The references returned by ``retrieve_task.remote`` are passed straight
    to ``follow_up_task.remote``; no ``ray.get`` happens in between, so the
    only blocking call is the final ``ray.get`` on the follow-up references.
    """
    start: float = time.time()

    retrieved_refs: list[ray.ObjectRef] = [
        retrieve_task.remote(item, db_object_ref) for item in [0, 2, 4, 6]
    ]
    # Each follow-up task depends on the output of one retrieve task.
    follow_up_refs: list[ray.ObjectRef] = [
        follow_up_task.remote(ref) for ref in retrieved_refs
    ]
    data: list[list[tuple[int, str]]] = ray.get(follow_up_refs)
    print_runtime(data, start)
    return data


if __name__ == "__main__":
    # Call the main function
    # NOTE: this rebinds ``all_data``; it now holds two-element lists of
    # tuples rather than the flat list of tuples assigned by an earlier cell.
    all_data = parallel_call_2()

Runtime: 1.42 seconds, data:
[(0, 'Learning'), (1, 'Ray')]
[(2, 'Flexible'), (3, 'Distributed')]
[(4, 'Python'), (5, 'for')]
[(6, 'Machine'), (7, 'Learning')]


In [None]:
def parallel_call_3() -> list[list[tuple[int, str]]]:
    """Chain ``retrieve_task`` into ``follow_up_task`` and collect results as they finish.

    Returns
    -------
    list[list[tuple[int, str]]]
        One ``[retrieve_result, follow_up_result]`` list per starting index
        in ``[0, 2, 4, 6]``, appended in the order ``ray.wait`` reported
        them as done.

    Notes
    -----
    ``ray.wait`` is called without ``num_returns``, so it uses Ray's default
    of one finished reference per call, waiting up to 10 seconds each time.
    """
    start: float = time.time()

    all_data: list[list[tuple[int, str]]] = []
    retrieved_refs: list[ray.ObjectRef] = [
        retrieve_task.remote(item, db_object_ref) for item in [0, 2, 4, 6]
    ]
    # Each follow-up task depends on the output of one retrieve task.
    follow_up_refs: list[ray.ObjectRef] = [
        follow_up_task.remote(ref) for ref in retrieved_refs
    ]

    while len(follow_up_refs) > 0:
        # ``done_refs`` can be empty if the timeout expires first.
        done_refs, follow_up_refs = ray.wait(follow_up_refs, timeout=10)
        data = ray.get(done_refs)
        all_data.extend(data)
        print_runtime(data, start)

    return all_data


if __name__ == "__main__":
    # Call the main function
    # NOTE: this rebinds ``all_data`` to the value returned by
    # parallel_call_3 (a list of two-element lists).
    all_data = parallel_call_3()

Runtime: 0.14 seconds, data:
[(0, 'Learning'), (1, 'Ray')]
Runtime: 0.60 seconds, data:
[(2, 'Flexible'), (3, 'Distributed')]
Runtime: 0.95 seconds, data:
[(4, 'Python'), (5, 'for')]
Runtime: 1.35 seconds, data:
[(6, 'Machine'), (7, 'Learning')]


In [26]:
names = ["Michael", "Sarah", "Joshua", "Narine", "David"]

for name in names:
    # Assignment expression: bind the lowercased name and test it in one
    # step; an empty string is falsy and would be skipped.
    if cleaned_name := name.lower():
        print(cleaned_name)

michael
sarah
joshua
narine
david
