In [None]:
import json
import ray
import requests
from ray import serve
from transformers import pipeline

# Running an LLM with Ray

__Road Map__:
* Ray Core + Huggingface
* Motivating Actors
* Ray Core Actor
* Using Actors in Ray AI Libraries
* Ray Data
* Ray Serve

In [None]:
# Model identifier passed as `model=` to every text-generation pipeline in this notebook.
CHAT_MODEL = 'Qwen/Qwen2.5-0.5B-Instruct'

# Sample user prompt used for the first Ray task below.
prompt = "Tell me something about large language models."

@ray.remote(num_gpus=1)
# NOTE: num_gpus is used by Ray for scheduling/load accounting and to set CUDA_VISIBLE_DEVICES;
# Ray does not enforce resource usage quotas.
def basic_hf(prompt):
    """Answer a single prompt with a text-generation pipeline built inside the task.

    The pipeline is constructed in the function body, so every invocation
    constructs it again before generating.

    Args:
        prompt: Text placed in the user turn of the conversation.

    Returns:
        The pipeline's output for the two-message conversation, generated with
        ``max_new_tokens=200`` and ``truncation=True``.
    """
    generator = pipeline(
        "text-generation",
        model=CHAT_MODEL,
        device='cuda',
        model_kwargs={"cache_dir": "/mnt/local_storage"},
    )
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": prompt}
    return generator([system_turn, user_turn], max_new_tokens=200, truncation=True)

In [None]:
# Submit the task to Ray; `ref` refers to its pending result, which is fetched in the next cell.
ref = basic_hf.remote(prompt)

In [None]:
# Wait for the task behind `ref` to finish and display its result.
ray.get(ref)

What's wrong with this code? We're loading the model from storage on every call.

What's the solution? Ray Actors!

Define an actor

In [None]:
@ray.remote(num_gpus=0.15)
class Chat:
    """Actor that builds a text-generation pipeline once and reuses it for every reply."""

    def __init__(self, model: str):
        """Construct the pipeline for ``model`` and keep it on the instance.

        Args:
            model: Model identifier forwarded to ``pipeline``.
        """
        self.pipe = pipeline(
            "text-generation",
            model=model,
            device='cuda',
            model_kwargs={"cache_dir": "/mnt/local_storage"},
        )

    def reply(self, prompts):
        """Generate responses for a batch of prompts.

        Args:
            prompts: Iterable of user prompts; each one becomes the user turn
                of its own conversation, preceded by a fixed system turn.

        Returns:
            The pipeline's output for the list of conversations.
        """
        conversations = [
            [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_text},
            ]
            for user_text in prompts
        ]
        generation_args = {"max_new_tokens": 200, "truncation": True}
        return self.pipe(conversations, **generation_args)

Instantiate one or more actors

In [None]:
# Create one Chat actor; its constructor receives CHAT_MODEL and builds the pipeline.
chat = Chat.remote(CHAT_MODEL)

Make multiple calls to the same actor

In [None]:
queries = ["What are some top attractions in Seattle?", "What are some top attractions in Los Angeles?"]

# Send both queries to the actor in a single `reply` call; `ref` refers to the pending result.
ref = chat.reply.remote(queries) 

# Wait for the call to finish and display the responses.
ray.get(ref)

In [None]:
more_queries = ["What are some top attractions in Vancouver BC?", "What are some top attractions in Portland OR?"]

# Second call on the same `chat` actor handle, so the pipeline built in its constructor is reused.
ray.get(chat.reply.remote(more_queries))

## Ray Data + Actors: LLM Processing Pipeline

In [None]:
cp prompts.parquet /mnt/cluster_storage/prompts.parquet

In [None]:
# Load the Parquet file from cluster storage as a Ray Data dataset.
prompts = ray.data.read_parquet('/mnt/cluster_storage/prompts.parquet')

In [None]:
# Peek at the data: restrict to five rows and fetch them as a single batch.
prompts.limit(5).take_batch()

Recall that the workhorse of Ray Data processing pipelines is the `map_batches` API call on a `Dataset`

`map_batches` supports stateless (tasks) and stateful (actors) processing

In [None]:
class PromptEnhancer:
    """Callable batch transform that wraps each raw prompt in a chat-style message list."""

    @staticmethod
    def _as_conversation(user_text):
        """Return the [system, user] message list for one prompt."""
        return [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_text},
        ]

    def __call__(self, batch):
        """Add an ``enhanced_prompt`` column to ``batch`` and return the same batch.

        Args:
            batch: Mapping with a ``'prompt'`` entry holding the raw prompts.

        Returns:
            ``batch`` itself, with ``batch['enhanced_prompt']`` set to one
            message list per entry of ``batch['prompt']``.
        """
        batch['enhanced_prompt'] = [
            self._as_conversation(user_text) for user_text in batch['prompt']
        ]
        return batch

In [None]:
# Limit to five rows first, then apply PromptEnhancer (concurrency=2) and fetch one batch.
prompts.limit(5).map_batches(PromptEnhancer, concurrency=2).take_batch()

Similar result, different internal semantics

In [None]:
# No limit() here: PromptEnhancer is applied to `prompts` directly and take_batch(5) bounds the output.
prompts.map_batches(PromptEnhancer, concurrency=2).take_batch(5)

Once we have prompts, we can extend the pipeline with our LLM Chat functionality.

In [None]:
class Chat:
    """Callable batch transform that generates a response for every enhanced prompt."""

    def __init__(self, model: str):
        """Build the text-generation pipeline once per instance.

        Args:
            model: Model identifier forwarded to ``pipeline``.
        """
        self.pipe = pipeline(
            "text-generation",
            model=model,
            device='cuda',
            model_kwargs={"cache_dir": "/mnt/local_storage"},
        )

    def __call__(self, batch):
        """Add a ``responses`` column to ``batch`` and return the same batch.

        Args:
            batch: Mapping with an ``'enhanced_prompt'`` entry holding one
                iterable of chat messages per row.

        Returns:
            ``batch`` itself, with ``batch['responses']`` set to the
            pipeline's output for those conversations.
        """
        # Copy each row's messages into a plain Python list (one level deep)
        # before handing the conversations to the pipeline.
        conversations = [list(messages) for messages in batch['enhanced_prompt']]
        generated = self.pipe(conversations, max_new_tokens=200, truncation=True)
        batch['responses'] = generated
        return batch

Note that in this usage, we specify the resources in the `map_batches` call rather than on the actor (class) definition itself.

As we'll see later, this provides more flexibility for performance and scaling purposes.

In [None]:
# End-to-end pipeline over five prompts: build chat messages, then generate responses.
(
    prompts
    .limit(5)
    # Step 1: wrap each raw prompt in a system/user message list.
    .map_batches(PromptEnhancer, concurrency=2)
    # Step 2: run the model; GPU share and constructor arguments are given here
    # rather than on the Chat class itself.
    .map_batches(
        Chat,
        concurrency=(2, 4),
        fn_constructor_args=[CHAT_MODEL],
        num_gpus=0.15,
        batch_size=4,
    )
    .take_batch()
)

# Bonus: Ray Actor as a service for low-latency inference with Ray Serve
## Moving toward production-grade hosting with Ray Serve

Ray Actors can be used to host a service providing basic encapsulation and RPC for internal clients

For more robust services Ray Serve adds scalability, load balancing, and more.

### What is Ray Serve?

Serve is a microservices framework for serving ML – the model serving
component of Ray AI Libraries.

<img src='https://technical-training-assets.s3.us-west-2.amazonaws.com/Ray_Serve/serve_architecture.png' width=700/>

### Deployments

`Deployment` is the fundamental developer-facing element of Serve.

<img src='https://technical-training-assets.s3.us-west-2.amazonaws.com/Ray_Serve/deployment.png' width=600/>

In [None]:
@serve.deployment(ray_actor_options={"num_gpus": 0.15})
class Chat:
    """Serve deployment that keeps one text-generation pipeline per replica instance."""

    def __init__(self, model: str):
        """Build the pipeline for ``model`` and store it on the instance.

        Args:
            model: Model identifier forwarded to ``pipeline``.
        """
        self.pipe = pipeline(
            "text-generation",
            model=model,
            device='cuda',
            model_kwargs={"cache_dir": "/mnt/local_storage"},
        )

    def reply(self, prompts):
        """Generate responses for a batch of prompts.

        Args:
            prompts: Iterable of user prompts; each is paired with a fixed
                system message to form one conversation.

        Returns:
            The pipeline's output for the list of conversations.
        """
        conversations = []
        for user_text in prompts:
            system_turn = {"role": "system", "content": "You are a helpful assistant."}
            user_turn = {"role": "user", "content": user_text}
            conversations.append([system_turn, user_turn])
        responses = self.pipe(conversations, max_new_tokens=200, truncation=True)
        return responses

# Start the deployment as the application named 'chat'; the returned handle is used for calls below.
handle = serve.run(Chat.bind(model=CHAT_MODEL), name='chat')

In [None]:
# Call the deployment's `reply` method through the handle; the response is awaited in the next cell.
ref = handle.reply.remote(queries)

In [None]:
# Top-level await (presumably relies on the notebook's running event loop); displays the response.
await ref

Ray Serve has various other capabilities, including gRPC/HTTP access, FastAPI compatibility, in-place upgrading of components, and more.

In [None]:
# Remove the application that was started above under the name 'chat'.
serve.delete('chat')

In [None]:
# Shut down Ray Serve now that the demo is finished.
serve.shutdown()