# Parallel computation with Ray

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/coobas/europython-25/blob/main/98-ray.ipynb)

In [None]:
# Run this in Google Collab, perhaps not elsewhere
!pip install numpy ray[default] polars
!europython-25/*.parquet local_data/
!git clone https://github.com/coobas/europython-25.git
!mkdir -p local_data
!cp europython-25/*.parquet local_data/

In [5]:
import numpy as np
import polars as pl
import ray

from pathlib import Path

In [11]:
# Constants
N_POINTS = 10   # Default number of grid points along each axis (x and y)
LIMIT = 10.0    # The grid spans from -LIMIT to +LIMIT along both x and y
DEFAULT_K = 4   # Number of nearest neighbours averaged for each query point

In [7]:
def calculate_distances(query_points: np.ndarray, reference_points: np.ndarray) -> np.ndarray:
    """
    Calculate pairwise Euclidean distances between M query and N reference points.

    Both inputs store one point per column. Only the first three rows of
    ``reference_points`` are used as coordinates; further rows are ignored.

    Returns:
    --------
    distances: np.ndarray
        (M, N) array; element [i, j] is the distance between query point i
        and reference point j
    """
    # Give each point set its own axis so that the subtraction broadcasts
    # to shape (coordinates, M, N).
    query_coords = query_points[:, :, np.newaxis]
    reference_coords = reference_points[:3, np.newaxis]
    deltas = reference_coords - query_coords
    # Collapse the coordinate axis to obtain one distance per pair.
    squared_distances = np.sum(deltas ** 2, axis=0)
    return np.sqrt(squared_distances)


def knn_search(
    query_points: np.ndarray,
    reference_points: np.ndarray,
    k: int,
    distances_func=calculate_distances,
) -> np.ndarray:
    """
    Find k nearest neighbour reference point indices for N query points.

    Parameters:
    -----------
    query_points: np.ndarray
        Query points, passed unchanged to ``distances_func``
    reference_points: np.ndarray
        Reference points, passed unchanged to ``distances_func``
    k: int
        Number of neighbours to return for each query point; must not exceed
        the number of reference points
    distances_func: callable
        Function returning a (queries, references) matrix of distances

    Returns:
    --------
    indices: np.ndarray
        (N, k) matrix of integral indices; the order of the indices within
        a row is unspecified (they are not sorted by distance)

    """
    distances = distances_func(query_points, reference_points)
    # Partitioning around position k - 1 moves the k smallest distances of
    # every row into its first k columns. Using k - 1 instead of k keeps the
    # pivot in bounds when k equals the number of reference points.
    nearest_indices = np.argpartition(distances, k - 1, axis=1)[:, :k]
    return nearest_indices

In [12]:
def create_point_grid(n_points: int = N_POINTS) -> tuple[np.ndarray, ...]:
    """
    Build a regular square grid spanning -LIMIT..LIMIT in both x and y.

    Returns:
    --------
    x: np.ndarray
        Flattened (n_points x n_points,) array of x values
    y: np.ndarray
        Flattened (n_points x n_points,) array of y values
    """
    # TODO: Add floor
    x_axis = np.linspace(-LIMIT, LIMIT, n_points)
    y_axis = np.linspace(-LIMIT, LIMIT, n_points)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)
    return grid_x.flatten(), grid_y.flatten()


def create_query_points(n_points: int = N_POINTS, floor: int = 1) -> np.ndarray:
    """
    Build query points on a regular x/y grid, all placed on the same floor.

    Returns:
    --------
    query_points: np.ndarray
        (3, n_points x n_points) array with rows x, y and floor
    """
    grid_x, grid_y = create_point_grid(n_points=n_points)
    # Every grid point gets the same floor value.
    floor_row = np.ones(grid_x.shape[0]) * floor
    return np.vstack([grid_x, grid_y, floor_row])

In [9]:
def compute_prices(query_points: np.ndarray, reference_points: np.ndarray) -> np.ndarray:
    """
    Estimate a price for each of N query points.

    The estimate is the mean price of the DEFAULT_K nearest reference points.

    Returns:
    --------
    prices: np.ndarray
        (N,) array of prices
    """
    neighbour_indices = knn_search(query_points, reference_points, DEFAULT_K)
    # Row 3 of the reference data is read as the price of each reference point.
    reference_prices = reference_points[3]
    neighbour_prices: np.ndarray = reference_prices[neighbour_indices]
    return neighbour_prices.mean(axis=1)


def combine_points_and_prices(
    query_points: np.ndarray, prices: np.ndarray
) -> pl.DataFrame:
    """
    Assemble the query point coordinates and their prices into one table.

    Returns:
    --------
    df: pl.DataFrame
        DataFrame with columns x, y, floor, price
    """
    # The first three rows of query_points are x, y and floor, in that order.
    x_values, y_values, floor_values = (query_points[row] for row in range(3))
    columns = {
        "x": x_values,
        "y": y_values,
        "floor": floor_values,
        "price": prices,
    }
    return pl.DataFrame(columns)

In [10]:
def load_reference_points(path: Path = Path("local_data/data.parquet")) -> np.ndarray:
    """
    Read the reference data points stored in a Parquet file.

    Returns:
    --------
    data_points: np.ndarray
        (4, N) array of data points with rows x, y, floor, and price
    """
    column_names = ("x", "y", "floor", "price")
    table = pl.read_parquet(path)
    # One row per column of the file, stacked in the order given above.
    return np.vstack([table[name].to_numpy() for name in column_names])

load_reference_points()

array([[-2.50919762e+00,  9.01428613e+00,  4.63987884e+00, ...,
         8.93415830e+00, -2.05024015e+00, -5.65719192e+00],
       [-2.52718363e+00, -3.34175808e+00, -6.47692175e+00, ...,
        -3.92603062e+00, -1.13359987e+00, -6.55470371e+00],
       [ 1.00000000e+00,  3.00000000e+00,  1.20000000e+01, ...,
         1.60000000e+01,  1.00000000e+00,  1.40000000e+01],
       [ 9.83737837e+02,  5.20453425e+02,  6.21498431e+02, ...,
         6.73260987e+02,  1.32561175e+03,  6.46913957e+02]],
      shape=(4, 10000))

## Run without ray

In [None]:
# Load the reference data and build the grid of points to be priced
reference_points = load_reference_points()
query_points = create_query_points(n_points=21)  # 21x21 grid

In [None]:
# Compute the prices with a plain function call and display them
prices = compute_prices(query_points, reference_points)
prices

In [None]:
%%time
# Combine the query points and their prices into one DataFrame and display it
points_and_prices = combine_points_and_prices(query_points=query_points, prices=prices)
points_and_prices

In [15]:
@ray.remote
def long_running_task():
    """
    Sleep for 60 seconds and then report completion.

    Returns:
    --------
    message: str
        The text "Finished long running task"
    """
    from time import sleep

    sleep(60)
    return "Finished long running task"

In [18]:
# Submit the task to Ray and wait for its result
ray.get(long_running_task.remote())

'Finished long running task'

## Monitoring ray

Ray comes with a nice dashboard that allows you to observe running jobs. It runs in a local web server, most likely at http://localhost:8265. This address is not accessible when running within Google Colab, so you have to use a special trick to show a mini-window forwarded to the dashboard running in the cloud.

In [None]:
# Show the Ray dashboard (port 8265) as an embedded frame when running in Colab.
# NOTE(review): Ray may serve the dashboard on a different port (a recorded
# run in this notebook reports 8266) - confirm the port from the ray.init() log.
try:
    # The import fails outside Google Colab, which selects the fallback below
    from google.colab import output
    output.serve_kernel_port_as_iframe(8265)
except ImportError:
    print("Not in google Colab. Try the local link, it might work.")

Not in google Colab. Try the local link, it might work.


## Compute prices in ray

In [None]:
@ray.remote
def compute_prices(query_points: np.ndarray, reference_points: np.ndarray) -> np.ndarray:
    """
    Estimate a price for each of N query points as a Ray remote function.

    The estimate is the mean price of the DEFAULT_K nearest reference points.
    Invoke it with ``compute_prices.remote(...)`` and fetch the result with
    ``ray.get``.

    Returns:
    --------
    prices: np.ndarray
        (N,) array of prices
    """
    neighbour_indices = knn_search(query_points, reference_points, DEFAULT_K)
    # Row 3 of the reference data is read as the price of each reference point.
    reference_prices = reference_points[3]
    neighbour_prices: np.ndarray = reference_prices[neighbour_indices]
    return neighbour_prices.mean(axis=1)

In [14]:
reference_points = load_reference_points()
query_points = create_query_points(n_points=21)  # 21x21 grid

# Start Ray; ignore_reinit_error lets this cell be re-run without an error
ray.init(ignore_reinit_error=True)

# .remote() submits the computation; `prices` is a handle that is passed to
# ray.get() to obtain the values
prices = compute_prices.remote(query_points, reference_points)
prices

2025-07-13 21:09:45,253	INFO worker.py:1879 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m


AttributeError: 'function' object has no attribute 'remote'

In [None]:
# Wait for the remote computation and fetch the resulting prices
ray.get(prices)