In [1]:
from sentence_transformers import SentenceTransformer
import chromadb
import ray
from transformers import pipeline
import numpy as np
import random

# Ray Data Pipelines for RAG Applications

Our initial goal is to use Ray Data to implement a RAG pipeline for the following flow:

[Read Queries] => [Generate Embeddings] => [Retrieve Matching Docs] => [Build LLM Prompts] => [Get LLM Responses] => [Store Output]

Once we have that working, we'll look at a couple of other patterns for working with multiple models.

In [2]:
# Model identifier handed to the Embedder stage (loaded with SentenceTransformer).
EMBEDDER_MODEL = "hkunlp/instructor-large"
# Model identifier handed to the Chat stage (loaded with a transformers text-generation pipeline).
CHAT_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"

In [3]:
# Read the parquet file of prompts into a Ray Dataset; every pipeline below starts from `data`.
data = ray.data.read_parquet('/mnt/cluster_storage/prompts.parquet')
# Preview a batch of four rows to check the column names and contents.
data.take_batch(4)

2026-01-20 21:50:07,138	INFO worker.py:1821 -- Connecting to existing Ray cluster at address: 10.0.142.230:6379...
2026-01-20 21:50:07,151	INFO worker.py:1998 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-v4klp1kjtnk9yrxwdcz5ah11ub.i.anyscaleuserdata.com [39m[22m
2026-01-20 21:50:07,178	INFO packaging.py:463 -- Pushing file package 'gcs://_ray_pkg_e2a29b9874fe44a725ecbe0b29635f9b90cf05d4.zip' (10.23MiB) to Ray cluster...
2026-01-20 21:50:07,218	INFO packaging.py:476 -- Successfully pushed file package 'gcs://_ray_pkg_e2a29b9874fe44a725ecbe0b29635f9b90cf05d4.zip'.
2026-01-20 21:50:07,402	INFO logging.py:397 -- Registered dataset logger for dataset dataset_252_0
2026-01-20 21:50:07,424	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_252_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 21:50:07,424	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_252_0: InputDataBuffer[I

{'prompt': array(['Describe the body of water in Utah?',
        'Tell as much as you can about the robbery?',
        'Did Phileas Fogg really rob the bank?',
        'Who is the main protagonist of Around the World in 80 Days?'],
       dtype=object)}

In [5]:
class Embedder:
    """Callable Ray Data stage that adds an embedding for every prompt in a batch.

    Intended for use with ``map_batches`` as a callable class: the model is loaded
    once in ``__init__`` and reused for each batch passed to ``__call__``.

    Args:
        model: Name or path of the model to load with ``SentenceTransformer``.
    """

    def __init__(self, model: str):
        # Load the model named by the constructor argument. Previously this read
        # the module-level EMBEDDER_MODEL constant, silently ignoring whatever was
        # passed through `fn_constructor_args`.
        self._model = SentenceTransformer(model)

    def __call__(self, batch):
        """Encode ``batch['prompt']`` and store the result in ``batch['prompt_embedding']``.

        Args:
            batch: Mapping with a ``'prompt'`` entry holding the prompts to embed.

        Returns:
            The same ``batch`` object with the ``'prompt_embedding'`` entry added.
        """
        # Encoding is pinned to the first CUDA device.
        batch['prompt_embedding'] = self._model.encode(batch['prompt'], device='cuda:0')
        return batch

In [6]:
# Run the prompts through a pool of two Embedder actors (0.1 GPU each, batches of 4)
# and display the first four resulting rows.
(
    data
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        compute=ray.data.ActorPoolStrategy(size=2),
        num_gpus=0.1,
        batch_size=4,
    )
    .take_batch(4)
)

2026-01-20 21:52:47,077	INFO logging.py:397 -- Registered dataset logger for dataset dataset_254_0
2026-01-20 21:52:47,082	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_254_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 21:52:47,083	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_254_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=4] -> ActorPoolMapOperator[MapBatches(Embedder)]
2026-01-20 21:52:47,245	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 21:52:47,246	INFO progress_bar.py:215 -- ListFiles: Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 384.0MiB object store: Progress Completed 0 / ?
2026-01-20 21:52:47,247	INFO progress_bar.py:213 -- === Ray Data Progress {ReadFiles} ===
2026-01-20 21:52:47,248	INFO progress_bar.py:215 -- ReadFiles: Tasks: 0; Actors: 0; Queued blocks: 

{'prompt': array(['Describe the body of water in Utah?',
        'Tell as much as you can about the robbery?',
        'Did Phileas Fogg really rob the bank?',
        'Who is the main protagonist of Around the World in 80 Days?'],
       dtype=object),
 'prompt_embedding': array([[-0.04239421, -0.01472014, -0.05605758, ..., -0.02838667,
         -0.00659643,  0.06030383],
        [-0.03958433, -0.01468273, -0.02359905, ..., -0.02142349,
         -0.01574307,  0.06896093],
        [-0.03950648, -0.00295585, -0.03587967, ..., -0.06156909,
          0.00685327,  0.07862456],
        [-0.03007179,  0.00322504, -0.0551949 , ..., -0.01952592,
          0.00284252,  0.05230937]], dtype=float32)}

We can implement a vector database lookup service as an actor and use it for batch retrieval of the documents matching each query.

In [None]:
# On your head node, re-ingest your data to create a fresh collection
import chromadb
import shutil

# Create the database.
# Start from a clean slate: remove any existing store at this path
# (ignore_errors=True means a missing directory is not treated as an error).
shutil.rmtree("/mnt/cluster_storage/vector_store", ignore_errors=True)
client = chromadb.PersistentClient(path="/mnt/cluster_storage/vector_store")
# NOTE(review): this cell only creates the collection; no documents are added here,
# so queries against it return no documents until an ingestion step (not shown in
# this cell) has populated it.
collection = client.create_collection("persistent_text_chunks")


In [10]:
class ChromaDBReader:
    """Callable Ray Data stage that looks up documents for each prompt embedding.

    Opens a persistent ChromaDB store once per instance and queries one of its
    collections for every batch.

    Args:
        collection: Name of the existing collection to query.
        top_n: Value passed as ``n_results`` to each query.
        path: Directory of the persistent ChromaDB store. Defaults to the shared
            cluster-storage location used elsewhere in this notebook, so existing
            two-argument callers behave exactly as before.
    """

    def __init__(self, collection: str, top_n: int, path: str = "/mnt/cluster_storage/vector_store"):
        chroma_client = chromadb.PersistentClient(path=path)
        self._coll = chroma_client.get_collection(collection)
        self._top_n = top_n

    def __call__(self, batch):
        """Query the collection with ``batch['prompt_embedding']``.

        Args:
            batch: Mapping with a ``'prompt_embedding'`` entry holding one
                embedding per row.

        Returns:
            The same ``batch`` object with ``'responsive_documents'`` set to the
            ``'documents'`` entry of the query result.
        """
        # Split the batch-level container into a plain list of per-row embeddings.
        vecs = list(batch['prompt_embedding'])
        results = self._coll.query(query_embeddings=vecs, n_results=self._top_n)
        batch['responsive_documents'] = results['documents']
        return batch

In [11]:
# Chain the embedding stage into the vector-store lookup stage (two actors each)
# and display the first four resulting rows.
(
    data
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        compute=ray.data.ActorPoolStrategy(size=2),
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 3],
        compute=ray.data.ActorPoolStrategy(size=2),
    )
    .take_batch(4)
)

2026-01-20 21:58:08,696	INFO logging.py:397 -- Registered dataset logger for dataset dataset_260_0
2026-01-20 21:58:08,700	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_260_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 21:58:08,701	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_260_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=4] -> ActorPoolMapOperator[MapBatches(Embedder)] -> ActorPoolMapOperator[MapBatches(ChromaDBReader)]
2026-01-20 21:58:08,910	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 21:58:08,911	INFO progress_bar.py:215 -- ListFiles: Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 384.0MiB object store: Progress Completed 0 / ?
2026-01-20 21:58:08,913	INFO progress_bar.py:213 -- === Ray Data Progress {ReadFiles} ===
2026-01-20 21:58:08,914	INFO progress_bar.py:21

{'prompt': array(['Describe the body of water in Utah?',
        'Tell as much as you can about the robbery?',
        'Did Phileas Fogg really rob the bank?',
        'Who is the main protagonist of Around the World in 80 Days?'],
       dtype=object),
 'prompt_embedding': array([[-0.04239421, -0.01472014, -0.05605758, ..., -0.02838667,
         -0.00659643,  0.06030383],
        [-0.03958433, -0.01468273, -0.02359905, ..., -0.02142349,
         -0.01574307,  0.06896093],
        [-0.03950648, -0.00295585, -0.03587967, ..., -0.06156909,
          0.00685327,  0.07862456],
        [-0.03007179,  0.00322504, -0.0551949 , ..., -0.01952592,
          0.00284252,  0.05230937]], dtype=float32),
 'responsive_documents': array([array([], dtype=object), array([], dtype=object),
        array([], dtype=object), array([], dtype=object)], dtype=object)}

There may be some retrieval quality issues (note that `responsive_documents` is empty in the output above), but they are not our concern right now.

Next, we can create a component to enhance the prompt with context and instructions for the LLM

In [12]:
class PromptEnhancer:
    """Callable Ray Data stage that wraps each prompt in retrieval context and instructions.

    For every row, the retrieved documents are joined with newlines, inserted into
    an instruction template, and followed by the original prompt. The result is
    stored as a two-message chat (system + user) in ``batch['enhanced_prompt']``.
    """

    def __init__(self):
        # Template rendered with str.format; `{docs}` is the only placeholder.
        # The text (including the eight-space indents and the space after the
        # documents) matches what the previous eval-based f-string produced.
        self._base_prompt = (
            "You are a helpful assistant who can answer questions about a text based on your existing knowledge and documents supplied here.\n"
            "        When answering questions, use the following relevant excerpts from the text:\n"
            "        {docs} \n"
            "        If you don't have information to answer a question, please say you don't know. Don't make up an answer.\n"
        )

    def __call__(self, batch):
        """Build one chat-style prompt per row.

        Args:
            batch: Mapping with ``'prompt'`` (one prompt per row) and
                ``'responsive_documents'`` (one sequence of document strings per row).

        Returns:
            The same ``batch`` object with ``'enhanced_prompt'`` set to a list of
            ``[system_message, user_message]`` lists.
        """
        enhanced_prompts = []
        newline = '\n'

        for original_prompt, docs in zip(batch['prompt'], batch['responsive_documents']):
            # Plain string formatting replaces the former eval() of a constructed
            # f-string: no code is evaluated at runtime, and braces or quotes inside
            # the documents are inserted verbatim.
            context = self._base_prompt.format(docs=newline.join(docs))
            enhanced_prompts.append([
                {"role": "system", "content": "You are a helpful assistant."},
                # The original prompt goes last, on its own line after the template.
                {"role": "user", "content": context + original_prompt},
            ])

        batch['enhanced_prompt'] = enhanced_prompts
        return batch

In [13]:
# Embed, retrieve, then build the enhanced prompts (two actors per stage),
# and display the first four resulting rows.
(
    data
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        compute=ray.data.ActorPoolStrategy(size=2),
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 3],
        compute=ray.data.ActorPoolStrategy(size=2),
    )
    .map_batches(
        PromptEnhancer,
        compute=ray.data.ActorPoolStrategy(size=2),
    )
    .take_batch(4)
)

2026-01-20 21:59:26,609	INFO logging.py:397 -- Registered dataset logger for dataset dataset_264_0
2026-01-20 21:59:26,614	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_264_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 21:59:26,615	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_264_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=4] -> ActorPoolMapOperator[MapBatches(Embedder)] -> ActorPoolMapOperator[MapBatches(ChromaDBReader)] -> ActorPoolMapOperator[MapBatches(PromptEnhancer)]
2026-01-20 21:59:26,871	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 21:59:26,872	INFO progress_bar.py:215 -- ListFiles: Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 384.0MiB object store: Progress Completed 0 / ?
2026-01-20 21:59:26,874	INFO progress_bar.py:213 -- === Ray Data Progress {ReadFiles}

{'prompt': array(['Describe the body of water in Utah?',
        'Tell as much as you can about the robbery?',
        'Did Phileas Fogg really rob the bank?',
        'Who is the main protagonist of Around the World in 80 Days?'],
       dtype=object),
 'prompt_embedding': array([[-0.04239421, -0.01472014, -0.05605758, ..., -0.02838667,
         -0.00659643,  0.06030383],
        [-0.03958433, -0.01468273, -0.02359905, ..., -0.02142349,
         -0.01574307,  0.06896093],
        [-0.03950648, -0.00295585, -0.03587967, ..., -0.06156909,
          0.00685327,  0.07862456],
        [-0.03007179,  0.00322504, -0.0551949 , ..., -0.01952592,
          0.00284252,  0.05230937]], dtype=float32),
 'responsive_documents': array([array([], dtype=object), array([], dtype=object),
        array([], dtype=object), array([], dtype=object)], dtype=object),
 'enhanced_prompt': array([array([{'content': 'You are a helpful assistant.', 'role': 'system'},
               {'content': "You are a helpful as

And now we can add our batch LLM processing to the pipeline.

In [14]:
class Chat:
    """Callable Ray Data stage that generates a chat response for each enhanced prompt.

    Args:
        model: Model name or path handed to the ``text-generation`` pipeline.
    """

    def __init__(self, model: str):
        # The pipeline is placed on the first CUDA device; cache_dir is forwarded
        # through model_kwargs.
        loader_kwargs = {"cache_dir": "/mnt/local_storage"}
        self.pipe = pipeline("text-generation", model=model, device='cuda:0', model_kwargs=loader_kwargs)

    def __call__(self, batch):
        """Run the pipeline over ``batch['enhanced_prompt']`` and store ``batch['responses']``.

        Returns:
            The same ``batch`` object with the ``'responses'`` entry added.
        """
        # Each row's message sequence is copied into a plain list, so the pipeline
        # receives a list of lists (adjust as needed and/or for perf).
        conversations = []
        for messages in batch['enhanced_prompt']:
            conversations.append(list(messages))
        batch['responses'] = self.pipe(conversations, max_new_tokens=200, truncation=True)
        return batch

Since the output is getting larger at this point, for visual inspection we'll store a batch to a Python object and then print out some results

In [15]:
# Full pipeline: embed -> retrieve -> enhance -> chat. Keep one batch of 23 rows in
# `output` for inspection instead of displaying it inline.
output = (
    data
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=4,
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 3],
        compute=ray.data.ActorPoolStrategy(size=2),
    )
    .map_batches(
        PromptEnhancer,
        compute=ray.data.ActorPoolStrategy(size=2),
    )
    .map_batches(
        Chat,
        compute=ray.data.ActorPoolStrategy(size=2),
        fn_constructor_args=[CHAT_MODEL],
        num_gpus=0.15,
        batch_size=4,
    )
    .take_batch(23)
)

2026-01-20 22:00:09,173	INFO logging.py:397 -- Registered dataset logger for dataset dataset_269_0
2026-01-20 22:00:09,178	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_269_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 22:00:09,179	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_269_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> LimitOperator[limit=23] -> ActorPoolMapOperator[MapBatches(Embedder)] -> ActorPoolMapOperator[MapBatches(ChromaDBReader)] -> ActorPoolMapOperator[MapBatches(PromptEnhancer)] -> ActorPoolMapOperator[MapBatches(Chat)]
2026-01-20 22:00:09,517	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 22:00:09,518	INFO progress_bar.py:215 -- ListFiles: Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 384.0MiB object store: Progress Completed 0 / ?
2026-01-20 22:00:09,519	INFO progress_bar.

In [16]:
def print_visual_eval(batch):
    """Print each question and its generated answer for quick visual inspection.

    For every entry in ``batch['responses']``, the first result's
    ``'generated_text'`` message list is read: the last line of message 1's
    content is printed as the question, then message 2's content as the answer,
    followed by a separator line.
    """
    for response in batch['responses']:
        turns = response[0]['generated_text']
        # The original question is the final line of the enhanced user message.
        question = turns[1]['content'].split('\n')[-1]
        print(question, end='\n\n')
        print(turns[2]['content'])
        print('----------------\n')

In [17]:
# Show the question/answer pairs from the batch captured in `output`.
print_visual_eval(output)

Describe the body of water in Utah?

The body of water in Utah is the Great Salt Lake. It's a salt lake that covers an area of approximately 108,546 square miles (279,000 km²). The lake is located in the western part of Utah, near the Colorado River. It's one of the largest and most saline bodies of water in the United States, covering an area of around 30% of the state's total surface area.
----------------

Tell as much as you can about the robbery?

I'm sorry, but I cannot provide an answer to this question as there is no specific excerpt or text provided for me to reference. Please provide more context or information so that I may assist you better.
----------------

Did Phileas Fogg really rob the bank?

I'm sorry, but I do not have enough context or information to answer whether Phileas Fogg actually robbed the bank or not. The passage you provided does not mention anything about Phileas Fogg's actions in relation to the bank robbery. Therefore, it is not possible for me to deter

If we were happy with the pipeline, we might run it at larger scale and write the output to storage, a database, Kafka, etc.

In [18]:
# Run the full pipeline over the whole prompts file and persist the results as parquet.
(
    ray.data.read_parquet('/mnt/cluster_storage/prompts.parquet')
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=4,
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 3],
        concurrency=2,
    )
    .map_batches(
        PromptEnhancer,
        concurrency=2,
    )
    .map_batches(
        Chat,
        concurrency=2,
        fn_constructor_args=[CHAT_MODEL],
        num_gpus=0.15,
        batch_size=4,
    )
    .write_parquet('/mnt/cluster_storage/batch_output_1.parquet')
)

2026-01-20 22:01:28,658	INFO logging.py:397 -- Registered dataset logger for dataset dataset_276_0
2026-01-20 22:01:28,663	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_276_0. Full logs are in /tmp/ray/session_2026-01-20_18-18-31_241199_2386/logs/ray-data
2026-01-20 22:01:28,663	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_276_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> TaskPoolMapOperator[ReadFiles] -> ActorPoolMapOperator[MapBatches(Embedder)] -> ActorPoolMapOperator[MapBatches(ChromaDBReader)] -> ActorPoolMapOperator[MapBatches(PromptEnhancer)] -> ActorPoolMapOperator[MapBatches(Chat)] -> TaskPoolMapOperator[Write]
2026-01-20 22:01:29,045	INFO progress_bar.py:213 -- === Ray Data Progress {ListFiles} ===
2026-01-20 22:01:29,046	INFO progress_bar.py:215 -- ListFiles: Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 384.0MiB object store: Progress Completed 0 / ?
2026-01-20 22:01:29,047	INFO progress_b