In [1]:
# Install the exact, pinned library versions this notebook depends on.
# NOTE(review): the output of this cell warns that `--no-cache-dir` might not be applied
# correctly across the cluster — confirm whether the flag is still wanted.
!pip install --no-cache-dir --upgrade "sentence-transformers==3.4.1" "chromadb==0.6.3" "transformers==4.48.3" "xgboost==2.1.4"



[93m#################

Arguments ['--no-cache-dir'] might not be applied correctly across cluster, please check our documentation for supported flags: https://docs.anyscale.com/configuration/dependency-management/dependency-development

#################[0m


[92mSuccessfully registered `sentence-transformers, chromadb` and 2 other packages to be installed on all cluster nodes.[0m
[92mView and update dependencies here: https://console.anyscale.com/cld_g54aiirwj1s8t9ktgzikqur41k/prj_f1j47h9srml4cyg962id75ms2e/workspaces/expwrk_llznag1z2l695idvwp9pkhy1sj?workspace-tab=dependencies[0m


In [2]:
import json
import requests


# Running an LLM with Ray

__Road Map__:
* Ray Core + Huggingface
* Motivating Actors
* Ray Core Actor
* Using Actors in Ray AI Libraries
* Ray Data
* Ray Serve

In [3]:
from transformers import pipeline
import ray

# Sample user question sent to the model in the first (stateless task) example.
prompt = "Tell me something about large language models."

# Model identifier handed to `transformers.pipeline` by the task and actor examples.
CHAT_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"

# Ray uses `num_gpus` for scheduling/load accounting and sets CUDA_VISIBLE_DEVICES for the
# task, but it does not enforce a quota on how much of the GPU the task actually uses.
@ray.remote(num_gpus=1)
def basic_hf(prompt):
    """Answer a single user prompt with a text-generation pipeline built inside the task.

    The pipeline for ``CHAT_MODEL`` is constructed in the function body, so it is
    rebuilt on every invocation of the task.

    Args:
        prompt: Text placed in the "user" turn of the conversation.

    Returns:
        The pipeline's output for the two-turn (system + user) conversation.
    """
    generator = pipeline(
        "text-generation",
        model=CHAT_MODEL,
        device='cuda',
        model_kwargs={"cache_dir": "/mnt/local_storage"},
    )
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": prompt}
    return generator([system_turn, user_turn], max_new_tokens=200, truncation=True)

In [4]:
# Submit the task; the returned reference is resolved with `ray.get` in the next cell.
ref = basic_hf.remote(prompt)

2026-01-20 21:23:13,248	INFO worker.py:1821 -- Connecting to existing Ray cluster at address: 10.0.142.230:6379...
2026-01-20 21:23:13,263	INFO worker.py:1998 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-v4klp1kjtnk9yrxwdcz5ah11ub.i.anyscaleuserdata.com [39m[22m
2026-01-20 21:23:13,303	INFO packaging.py:463 -- Pushing file package 'gcs://_ray_pkg_0bd8078b0063d4195929ce96a7cf436461a67169.zip' (9.62MiB) to Ray cluster...
2026-01-20 21:23:13,348	INFO packaging.py:476 -- Successfully pushed file package 'gcs://_ray_pkg_0bd8078b0063d4195929ce96a7cf436461a67169.zip'.


In [5]:
# Fetch the task's result through its reference; the value is shown as the cell output.
ray.get(ref)

[36m(basic_hf pid=38379, ip=10.0.177.60)[0m Device set to use cuda


[{'generated_text': [{'role': 'system',
    'content': 'You are a helpful assistant.'},
   {'role': 'user',
    'content': 'Tell me something about large language models.'},
   {'role': 'assistant',
    'content': "Large language models (LLMs) are artificial intelligence programs that can produce human-like text based on the input they receive. These LLMs are often used for tasks such as translation, summarization, and question-answering.\n\nOne of the key features of LLMs is their ability to generate vast amounts of text quickly and efficiently. They use techniques such as neural networks and transformers to understand the meaning of words and phrases and generate natural-sounding responses.\n\nLLMs have been used in a wide range of applications, including customer service, healthcare, legal research, and more. For example, companies like Google and IBM have developed LLMs that can assist with customer support by generating responses to common questions.\n\nHowever, it's important to 

What's wrong with this code? We're loading the model from storage on every call.

What's the solution? Ray Actors!

Define an actor

In [6]:
@ray.remote(num_gpus=0.15)
class Chat:
    """Ray actor that builds a text-generation pipeline once and reuses it for every call."""

    def __init__(self, model: str):
        """Build the pipeline for ``model`` on the CUDA device.

        Args:
            model: Model identifier forwarded to ``pipeline``.
        """
        self.pipe = pipeline(
            "text-generation",
            model=model,
            device='cuda',
            model_kwargs={"cache_dir": "/mnt/local_storage"},
        )

    def reply(self, prompts):
        """Generate a response for each prompt using the pipeline held by this actor.

        Args:
            prompts: Iterable of user prompt strings.

        Returns:
            The pipeline's output for the list of (system + user) conversations,
            one conversation per prompt.
        """
        conversations = []
        for user_text in prompts:
            conversations.append([
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_text},
            ])
        return self.pipe(conversations, max_new_tokens=200, truncation=True)

Instantiate one or more actors

In [7]:
# Create one actor instance; CHAT_MODEL is the `model` argument of Chat.__init__.
chat = Chat.remote(CHAT_MODEL)

Make multiple calls to the same actor

In [8]:
# Two prompts sent together, in a single call, to the existing `chat` actor.
queries = [
    "What are some top attractions in Seattle?",
    "What are some top attractions in Los Angeles?",
]

ref = chat.reply.remote(queries)

ray.get(ref)

[36m(Chat pid=40427, ip=10.0.162.181)[0m Device set to use cuda


[[{'generated_text': [{'role': 'system',
     'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'What are some top attractions in Seattle?'},
    {'role': 'assistant',
     'content': 'Seattle is a city with many beautiful and historic attractions, including:\n\n1. The Microsoft Building - a former computer company building that has been converted into a museum.\n\n2. The Kingdome - home to the Seattle Mariners baseball team.\n\n3. The Seattle Aquarium - a marine life center located on the waterfront.\n\n4. The Space Needle - an iconic suspension bridge that offers stunning views of the city.\n\n5. The Pike Place Market - a bustling market filled with local shops, food vendors, and live music.\n\n6. The University of Washington - a world-renowned university with a rich history dating back to 1870.\n\n7. The University of Washington Stadium - the home field for the Seattle Seahawks football team.\n\n8. The Boeing Company Museum - a historical museum dedicated t

In [9]:
# A second batch of prompts for the same actor handle.
more_queries = [
    "What are some top attractions in Vancouver BC?",
    "What are some top attractions in Portland OR?",
]

ray.get(chat.reply.remote(more_queries))

[[{'generated_text': [{'role': 'system',
     'content': 'You are a helpful assistant.'},
    {'role': 'user',
     'content': 'What are some top attractions in Vancouver BC?'},
    {'role': 'assistant',
     'content': "Vancouver, British Columbia is one of Canada's most popular tourist destinations and home to many amazing attractions. Here are some of the top ones:\n\n1. Stanley Park - A stunning park with beautiful lakeside views and numerous hiking trails.\n\n2. The Vancouver Art Gallery - One of the largest art museums in North America, it houses an impressive collection of Canadian and international works.\n\n3. The Vancouver Aquarium - Home to over 400 marine animals from around the world, including sea turtles, sharks, and dolphins.\n\n4. Granville Island - A small island off the coast of Vancouver that offers stunning views of the city and the ocean.\n\n5. Mount St. Elias - A mountain range located on Vancouver Island, offering breathtaking views and hiking opportunities.\n\n

## Ray Data + Actors: LLM Processing Pipeline

In [10]:
cp prompts.parquet /mnt/cluster_storage/prompts.parquet

In [11]:
# Create a Ray Dataset from the parquet file copied in the previous cell.
prompts = ray.data.read_parquet('/mnt/cluster_storage/prompts.parquet')

In [12]:
# Peek at the first 5 rows, returned as a single batch.
prompts.limit(5).take_batch()

2026-01-20 21:25:17,147	INFO logging.py:397 -- Registered dataset logger for dataset dataset_209_0
2026-01-20 21:25:17,170	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_209_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 21:25:17,171	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_209_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=5]
2026-01-20 21:25:17,172	INFO streaming_executor.py:687 -- [dataset]: A new progress UI is available. To enable, set `ray.data.DataContext.get_current().enable_rich_progress_bars = True` and `ray.data.DataContext.get_current().use_ray_tqdm = False`.
2026-01-20 21:25:17,173	INFO progress_bar.py:155 -- Progress bar disabled because stdout is a non-interactive terminal.
2026-01-20 21:25:17,200	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 21:25:17,200	INFO progress_bar.p

{'prompt': array(['Describe the body of water in Utah?',
        'Tell as much as you can about the robbery?',
        'Did Phileas Fogg really rob the bank?',
        'Who is the main protagonist of Around the World in 80 Days?',
        'What is the name of Phileas Fogg’s loyal servant?'], dtype=object)}

Recall that the workhorse of Ray Data processing pipelines is the `map_batches` API call on a `Dataset`

`map_batches` supports stateless (tasks) and stateful (actors) processing

In [13]:
class PromptEnhancer:
    """Callable batch transform that wraps each raw prompt in a chat conversation."""

    def __call__(self, batch):
        """Add an ``'enhanced_prompt'`` entry to ``batch`` and return it.

        Args:
            batch: Mapping with a ``'prompt'`` entry holding the user prompts.

        Returns:
            The same ``batch`` object, with ``'enhanced_prompt'`` set to a list of
            [system, user] message lists, one per prompt.
        """
        conversations = []
        for user_text in batch['prompt']:
            conversations.append([
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_text},
            ])
        batch['enhanced_prompt'] = conversations
        return batch

In [14]:
# Cap the dataset at 5 rows, then apply PromptEnhancer (a class, so it runs in an
# actor pool) and show the result as one batch.
(
    prompts
    .limit(5)
    .map_batches(PromptEnhancer, concurrency=2)
    .take_batch()
)

2026-01-20 21:25:27,594	INFO logging.py:397 -- Registered dataset logger for dataset dataset_212_0
2026-01-20 21:25:27,600	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_212_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 21:25:27,600	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_212_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=5] -> ActorPoolMapOperator[MapBatches(PromptEnhancer)]
2026-01-20 21:25:27,775	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 21:25:27,776	INFO progress_bar.py:215 -- ListFiles: Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 384.0MiB object store: Progress Completed 0 / ?
2026-01-20 21:25:27,777	INFO progress_bar.py:213 -- === Ray Data Progress {ReadFiles} ===
2026-01-20 21:25:27,778	INFO progress_bar.py:215 -- ReadFiles: Tasks: 0; Actors: 0; Queued bl

{'prompt': array(['Describe the body of water in Utah?',
        'Tell as much as you can about the robbery?',
        'Did Phileas Fogg really rob the bank?',
        'Who is the main protagonist of Around the World in 80 Days?',
        'What is the name of Phileas Fogg’s loyal servant?'], dtype=object),
 'enhanced_prompt': array([array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Describe the body of water in Utah?', 'role': 'user'}],
              dtype=object)                                                        ,
        array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Tell as much as you can about the robbery?', 'role': 'user'}],
              dtype=object)                                                               ,
        array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Did Phileas Fogg really rob the bank?', 'role': 'user'}],
 

Similar result, different internal semantics

In [15]:
# Same preview, but the 5-row size is requested from take_batch instead of calling
# limit(5) on the dataset first.
(
    prompts
    .map_batches(PromptEnhancer, concurrency=2)
    .take_batch(5)
)

2026-01-20 21:25:30,932	INFO logging.py:397 -- Registered dataset logger for dataset dataset_214_0
2026-01-20 21:25:30,936	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_214_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 21:25:30,936	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_214_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=5] -> ActorPoolMapOperator[MapBatches(PromptEnhancer)]
2026-01-20 21:25:31,095	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 21:25:31,096	INFO progress_bar.py:215 -- ListFiles: Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 384.0MiB object store: Progress Completed 0 / ?
2026-01-20 21:25:31,097	INFO progress_bar.py:213 -- === Ray Data Progress {ReadFiles} ===
2026-01-20 21:25:31,098	INFO progress_bar.py:215 -- ReadFiles: Tasks: 0; Actors: 0; Queued bl

{'prompt': array(['Describe the body of water in Utah?',
        'Tell as much as you can about the robbery?',
        'Did Phileas Fogg really rob the bank?',
        'Who is the main protagonist of Around the World in 80 Days?',
        'What is the name of Phileas Fogg’s loyal servant?'], dtype=object),
 'enhanced_prompt': array([array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Describe the body of water in Utah?', 'role': 'user'}],
              dtype=object)                                                        ,
        array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Tell as much as you can about the robbery?', 'role': 'user'}],
              dtype=object)                                                               ,
        array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Did Phileas Fogg really rob the bank?', 'role': 'user'}],
 

Once we have prompts, we can extend the pipeline with our LLM Chat functionality.

In [16]:
class Chat:
    """Callable batch transform that generates a response for every enhanced prompt.

    NOTE(review): this rebinds the name ``Chat``, which earlier in the notebook refers
    to the ``@ray.remote`` actor class; re-running the earlier ``Chat.remote(...)`` cell
    after this one would no longer find the actor class.
    """

    def __init__(self, model: str):
        """Build the text-generation pipeline for ``model`` on the CUDA device.

        Args:
            model: Model identifier forwarded to ``pipeline``.
        """
        self.pipe = pipeline(
            "text-generation",
            model=model,
            device='cuda',
            model_kwargs={"cache_dir": "/mnt/local_storage"},
        )

    def __call__(self, batch):
        """Run the pipeline over ``batch['enhanced_prompt']`` and store the output.

        Args:
            batch: Mapping with an ``'enhanced_prompt'`` entry; each element is an
                iterable of chat messages.

        Returns:
            The same ``batch`` object, with ``'responses'`` set to the pipeline output.
        """
        # Turn each row's message sequence into a plain list before calling the pipeline.
        conversations = [list(messages) for messages in batch['enhanced_prompt']]
        batch['responses'] = self.pipe(conversations, max_new_tokens=200, truncation=True)
        return batch

Note that in this usage, we specify the resources in the `map_batches` call rather than on the actor (class) definition itself.

As we'll see later, this provides more flexibility for performance and scaling purposes.

In [17]:
# End-to-end preview on 5 rows: enhance the prompts, then generate responses with
# Chat. Resources (0.15 GPU), the (2, 4) concurrency range and the batch size are
# set here on the map_batches call rather than on the class definition.
(
    prompts
    .limit(5)
    .map_batches(PromptEnhancer, concurrency=2)
    .map_batches(
        Chat,
        concurrency=(2, 4),
        fn_constructor_args=[CHAT_MODEL],
        num_gpus=0.15,
        batch_size=4,
    )
    .take_batch()
)

2026-01-20 21:25:35,497	INFO logging.py:397 -- Registered dataset logger for dataset dataset_218_0
2026-01-20 21:25:35,503	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_218_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 21:25:35,503	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_218_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=5] -> ActorPoolMapOperator[MapBatches(PromptEnhancer)] -> ActorPoolMapOperator[MapBatches(Chat)]
2026-01-20 21:25:35,707	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 21:25:35,708	INFO progress_bar.py:215 -- ListFiles: Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 384.0MiB object store: Progress Completed 0 / ?
2026-01-20 21:25:35,708	INFO progress_bar.py:213 -- === Ray Data Progress {ReadFiles} ===
2026-01-20 21:25:35,710	INFO progress_bar.py:215 --

{'prompt': array(['Describe the body of water in Utah?',
        'Tell as much as you can about the robbery?',
        'Did Phileas Fogg really rob the bank?',
        'Who is the main protagonist of Around the World in 80 Days?',
        'What is the name of Phileas Fogg’s loyal servant?'], dtype=object),
 'enhanced_prompt': array([array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Describe the body of water in Utah?', 'role': 'user'}],
              dtype=object)                                                        ,
        array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Tell as much as you can about the robbery?', 'role': 'user'}],
              dtype=object)                                                               ,
        array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': 'Did Phileas Fogg really rob the bank?', 'role': 'user'}],
 