In [None]:
from sentence_transformers import SentenceTransformer
import chromadb
import ray
from transformers import pipeline
import numpy as np
import random

# Ray Data Pipelines for RAG Applications

Our initial goal is to use Ray Data to implement a RAG pipeline for the following flow:

[Read Queries] => [Generate Embeddings] => [Retrieve Matching Docs] => [Build LLM Prompts] => [Get LLM Responses] => [Store Output]

Once we have that working, we'll look at a couple of other patterns for working with multiple models.

In [None]:
# Model identifiers used below: the first is handed to the sentence-embedding
# stage, the second to the text-generation (chat) stage.
EMBEDDER_MODEL = 'hkunlp/instructor-large'
CHAT_MODEL = 'Qwen/Qwen2.5-0.5B-Instruct'

In [None]:
# Load the prompts dataset from shared cluster storage.
data = ray.data.read_parquet('/mnt/cluster_storage/prompts.parquet')
# Inspect a small sample batch (up to 4 rows).
data.take_batch(4)

In [None]:
class Embedder:
    """Ray Data batch actor that adds sentence embeddings for each prompt.

    The model is loaded once per actor in ``__init__`` and reused for every
    batch passed to ``__call__``.
    """

    def __init__(self, model: str):
        """Load the sentence-embedding model.

        Args:
            model: Name or path of the SentenceTransformer model to load.
        """
        # Use the constructor argument (supplied via ``fn_constructor_args``)
        # rather than a module-level global, so the actor is reusable with
        # any embedding model.
        self._model = SentenceTransformer(model)

    def __call__(self, batch: dict) -> dict:
        """Embed ``batch['prompt']`` and store the result in ``batch['prompt_embedding']``.

        Args:
            batch: Column-oriented batch containing a ``'prompt'`` column.

        Returns:
            The same batch with an added ``'prompt_embedding'`` column.
        """
        batch['prompt_embedding'] = self._model.encode(batch['prompt'], device='cuda:0')
        return batch

In [None]:
# Try the embedding stage on its own and look at one small output batch.
(
    data
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=2,
        num_gpus=0.1,
        batch_size=4,
    )
    .take_batch(4)
)

We can implement a vector-db lookup service as an actor and use it for batch retrieval of documents matching a query

In [None]:
class ChromaDBReader:
    """Ray Data batch actor that looks up matching documents in a Chroma collection."""

    def __init__(self, collection: str, top_n: int):
        """Open the persistent vector store and bind to one collection.

        Args:
            collection: Name of the Chroma collection to query.
            top_n: Number of results requested per query embedding.
        """
        self._top_n = top_n
        client = chromadb.PersistentClient(path="/mnt/cluster_storage/vector_store")
        self._coll = client.get_collection(collection)

    def __call__(self, batch):
        """Query the collection with ``batch['prompt_embedding']``.

        The ``'documents'`` entry of the query result is stored in
        ``batch['responsive_documents']``; the batch is returned.
        """
        query_result = self._coll.query(
            query_embeddings=list(batch['prompt_embedding']),
            n_results=self._top_n,
        )
        batch['responsive_documents'] = query_result['documents']
        return batch

In [None]:
# Embed the prompts, then retrieve the top 3 documents for each one.
(
    data
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=2,
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 3],
        concurrency=2,
    )
    .take_batch(4)
)

There may be some retrieval quality issues, but those are not our concern right now.

Next, we can create a component to enhance the prompt with context and instructions for the LLM

In [None]:
class PromptEnhancer:
    """Ray Data batch actor that wraps each prompt with instructions and retrieved context.

    For every row it builds a two-message chat (system + user). The user
    message is the instruction template, with the row's retrieved documents
    inserted, followed by the original prompt.
    """

    def __init__(self):
        # ``{docs}`` is filled in with ``str.format``. This replaces an earlier
        # ``eval`` of a dynamically built f-string and yields the same text,
        # including the indentation and the trailing space after the documents.
        self._base_prompt = (
            "You are a helpful assistant who can answer questions about a text based on your existing knowledge and documents supplied here.\n"
            "        When answering questions, use the following relevant excerpts from the text:\n"
            "        {docs} \n"
            "        If you don't have information to answer a question, please say you don't know. Don't make up an answer.\n"
        )

    def __call__(self, batch):
        """Add an ``'enhanced_prompt'`` column of chat-message lists to the batch.

        Args:
            batch: Column-oriented batch with ``'prompt'`` and
                ``'responsive_documents'`` columns (one sequence of document
                strings per prompt).

        Returns:
            The same batch with an added ``'enhanced_prompt'`` column.
        """
        enhanced_prompts = []
        for original_prompt, docs in zip(batch['prompt'], batch['responsive_documents']):
            # Only the template is parsed by ``format``; braces inside the
            # document text are inserted verbatim.
            context = self._base_prompt.format(docs='\n'.join(docs))
            enhanced_prompts.append([
                {"role": "system", "content": "You are a helpful assistant."},
                # The template ends with a newline, so the original question
                # is always the last line of the user message.
                {"role": "user", "content": context + original_prompt},
            ])

        batch['enhanced_prompt'] = enhanced_prompts
        return batch

In [None]:
# Embed, retrieve, and build the enhanced chat prompts; inspect one batch.
(
    data
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=2,
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 3],
        concurrency=2,
    )
    .map_batches(PromptEnhancer, concurrency=2)
    .take_batch(4)
)

And now we can add our batch LLM processing to the pipeline

In [None]:
class Chat:
    """Ray Data batch actor that runs a text-generation pipeline over enhanced prompts."""

    def __init__(self, model: str):
        """Create the text-generation pipeline for ``model`` on device ``cuda:0``."""
        self.pipe = pipeline(
            "text-generation",
            model=model,
            device='cuda:0',
            model_kwargs={"cache_dir": "/mnt/local_storage"},
        )

    def __call__(self, batch):
        """Generate a response per row and store the results in ``batch['responses']``."""
        # nested arrays to nested lists -- adjust as needed and/or for perf
        conversations = [list(messages) for messages in batch['enhanced_prompt']]
        batch['responses'] = self.pipe(conversations, max_new_tokens=200, truncation=True)
        return batch

Since the output is getting larger at this point, for visual inspection we'll store a batch to a Python object and then print out some results

In [None]:
# Run the full RAG pipeline and keep one batch in memory for inspection.
output = (
    data
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=4,
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 3],
        concurrency=2,
    )
    .map_batches(PromptEnhancer, concurrency=2)
    .map_batches(
        Chat,
        concurrency=2,
        fn_constructor_args=[CHAT_MODEL],
        num_gpus=0.15,
        batch_size=4,
    )
    .take_batch(23)
)

In [None]:
def print_visual_eval(batch):
    """Print each question and generated answer from ``batch['responses']``.

    For every response, the last line of the second chat message is printed,
    followed by a blank line, the content of the third chat message, and a
    dashed separator.
    """
    divider = '----------------\n'
    for response in batch['responses']:
        chat = response[0]['generated_text']
        # The original question is the final line of the user message.
        print(chat[1]['content'].rsplit('\n', 1)[-1])
        print()
        print(chat[2]['content'])
        print(divider)

In [None]:
# Print the results collected in `output` for visual inspection.
print_visual_eval(output)

If we were happy with the pipeline, we might run it at larger scale and write the output to storage, into a database, Kafka, etc.

In [None]:
# Run the full pipeline over the whole prompts dataset and persist the results.
(
    ray.data.read_parquet('/mnt/cluster_storage/prompts.parquet')
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=4,
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 3],
        concurrency=2,
    )
    .map_batches(PromptEnhancer, concurrency=2)
    .map_batches(
        Chat,
        concurrency=2,
        fn_constructor_args=[CHAT_MODEL],
        num_gpus=0.15,
        batch_size=4,
    )
    .write_parquet('/mnt/cluster_storage/batch_output_1.parquet')
)

## Multimodel pipelines with routing

In [None]:
# Second, larger chat model used as the alternative routing target below.
BIGGER_CHAT_MODEL='Qwen/Qwen2.5-1.5B-Instruct'

The simplest way -- though possibly not the best way -- to do model routing would be
* create a data-processing actor class like the `Chat` class
* insert business logic to
    * load multiple models in the constructor and use them for various subsets of the record batch, depending on some criteria or control flow
    * or run the control flow and then call out to some other service -- e.g., another Actor or Actor Pool -- to do the inference
* collect the results
* return the updated batch

However, there are some patterns which may allow for more optimization and tuning.

First, we'll look at using a router actor which chooses a target model for each record (at random, in this example).

We'll then pass the data in sequence to 2 different `FilteredChat` processing actors which only handle the records assigned to them.

> This pattern can also be expanded. For example, perhaps we score every record with the first (small) model, then the router applies some evaluation model to rate the results and assigns unsatisfactory records to be scored (again) with the second (larger, more sophisticated, but more expensive) model.

In [None]:
class Router:
    """Ray Data batch actor that assigns a target model to every row at random."""

    def __init__(self, models):
        """Remember the candidate model names to choose from."""
        self._models = models

    def __call__(self, batch):
        """Add a ``'target_model'`` column with one randomly chosen model per prompt."""
        row_count = len(batch['prompt'])
        # Sampling is with replacement, one independent draw per row.
        batch['target_model'] = random.choices(self._models, k=row_count)
        return batch
    
class FilteredChat:
    """Ray Data batch actor that runs chat inference only for rows routed to its model.

    Rows whose ``'target_model'`` differs from this actor's model are left
    untouched, so several ``FilteredChat`` stages can be chained and each one
    fills in its own share of the ``'responses'`` column.
    """

    def __init__(self, model: str):
        """Create the text-generation pipeline for ``model`` on device ``cuda:0``.

        Args:
            model: Model identifier. Also the value matched against
                ``batch['target_model']`` to select rows.
        """
        self._model = model
        self.pipe = pipeline("text-generation", model=model, device='cuda:0', model_kwargs={"cache_dir": "/mnt/local_storage"})

    def __call__(self, batch):
        """Generate responses for the rows assigned to this actor's model.

        Args:
            batch: Column-oriented batch with ``'target_model'`` and
                ``'enhanced_prompt'`` columns, and optionally a ``'responses'``
                column produced by an earlier stage.

        Returns:
            The same batch. ``'responses'`` is created if missing (initialised
            with ``None``) and filled in for the matching rows.
        """
        # ``asarray`` makes the comparison element-wise even if the column
        # arrives as a plain list.
        indices = np.flatnonzero(np.asarray(batch['target_model']) == self._model)

        # Always create the column so every output batch has the same set of
        # columns, even when this stage has nothing to do.
        if 'responses' not in batch:
            batch['responses'] = np.empty(len(batch['enhanced_prompt']), dtype=object)

        # No row targets this model: skip inference rather than calling the
        # pipeline with an empty list, which it is not guaranteed to accept.
        if indices.size == 0:
            return batch

        # Nested arrays to nested lists, as the pipeline expects plain lists.
        prompts = [list(batch['enhanced_prompt'][i]) for i in indices]
        responses = self.pipe(prompts, max_new_tokens=200, truncation=True)

        # Write back one element at a time so each response object is stored
        # as-is, without depending on NumPy's nested-sequence assignment rules.
        for i, response in zip(indices, responses):
            batch['responses'][i] = response
        return batch

We can also experiment with more docs from the vector store, different concurrency numbers, etc.

In [None]:
# Route each record to one of two models and run both filtered chat stages in sequence.
(
    ray.data.read_parquet('/mnt/cluster_storage/prompts.parquet')
    .repartition(4)
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=4,
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 5],
        concurrency=4,
    )
    .map_batches(PromptEnhancer, concurrency=4)
    .map_batches(
        Router,
        concurrency=2,
        fn_constructor_args=[[CHAT_MODEL, BIGGER_CHAT_MODEL]],
    )
    .map_batches(
        FilteredChat,
        concurrency=2,
        fn_constructor_args=[CHAT_MODEL],
        num_gpus=0.15,
        batch_size=8,
    )
    .map_batches(
        FilteredChat,
        concurrency=2,
        fn_constructor_args=[BIGGER_CHAT_MODEL],
        num_gpus=0.5,
        batch_size=8,
    )
    .take_batch(16)
)

One additional Ray Data pattern is to split the dataset using the `.filter` API and then send the filtered streams to dedicated `Chat` processors for the respective models.
* This pattern might have potential benefits in keeping the batch sizes uniform at the point of LLM inference.

However, Ray by default would read and pre-process the whole dataset before running the filter and then the inference on the target subset.
* We can work around this by using `.materialize` to cache the dataset at the point where we are ready to "branch" it with filter operations.
* This caching approach will use up object store memory and, most likely for a large dataset, will also spill to disk across the nodes.
    * Caching + spilling is not necessarily a problem but we should understand the tradeoffs, especially for very large datasets

In [None]:
# Run everything up to and including routing, then materialize the result so
# both filter branches below start from the same cached dataset.
ready_to_filter = (
    ray.data.read_parquet('/mnt/cluster_storage/prompts.parquet')
    .repartition(4)
    .map_batches(
        Embedder,
        fn_constructor_args=[EMBEDDER_MODEL],
        concurrency=4,
        num_gpus=0.1,
        batch_size=4,
    )
    .map_batches(
        ChromaDBReader,
        fn_constructor_args=['persistent_text_chunks', 5],
        concurrency=4,
    )
    .map_batches(PromptEnhancer, concurrency=4)
    .map_batches(
        Router,
        concurrency=2,
        fn_constructor_args=[[CHAT_MODEL, BIGGER_CHAT_MODEL]],
    )
    .materialize()
)

In [None]:
# Branch 1: records routed to the small chat model.
small_model_outputs = (
    ready_to_filter
    .filter(expr=f"target_model=='{CHAT_MODEL}'")
    .map_batches(
        Chat,
        concurrency=2,
        fn_constructor_args=[CHAT_MODEL],
        num_gpus=0.15,
        batch_size=4,
    )
)

In [None]:
# Branch 2: records routed to the larger chat model.
large_model_outputs = (
    ready_to_filter
    .filter(expr=f"target_model=='{BIGGER_CHAT_MODEL}'")
    .map_batches(
        Chat,
        concurrency=2,
        fn_constructor_args=[BIGGER_CHAT_MODEL],
        num_gpus=0.5,
        batch_size=4,
    )
)

In [None]:
# Recombine the two per-model branches into a single dataset.
results = small_model_outputs.union(large_model_outputs)

In [None]:
# Fetch a sample batch (up to 12 rows) from the combined results.
results.take_batch(12)