12 changes: 12 additions & 0 deletions README.md
@@ -150,6 +150,18 @@ the Node service locally, you can do so by following these steps:
docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/databases/qdrant_storage:/qdrant/storage:z qdrant/qdrant
```

#### Modifying the UI in CML

This is an unsupported workflow, but it is possible to modify the UI code in CML:

- Start a CML Session from a CML Project that has the RAG Studio AMP installed.
- Open the terminal in the CML Session and navigate to the `ui` directory.
- Run `source ~/.bashrc` to ensure the Node environment variables are loaded.
- Install PNPM using `npm install -g pnpm`. Docs on PNPM can be found here: https://pnpm.io/installation#using-npm
- Run `pnpm install` to install the dependencies.
- Make your changes to the UI code in the `ui` directory.
- Run `pnpm build` to build the new UI bundle.

## The Fine Print

IMPORTANT: Please read the following before proceeding. This AMP includes or otherwise depends on certain third party software packages. Information about such third party software packages is made available in the notice file associated with this AMP. By configuring and launching this AMP, you will cause such third party software packages to be downloaded and installed into your environment, in some instances, from third parties' websites. For each third party software package, please see the notice file and the applicable websites for more information, including the applicable license terms. If you do not wish to download and install the third party software packages, do not configure, launch or otherwise use this AMP. By configuring, launching or otherwise using the AMP, you acknowledge the foregoing statement and agree that Cloudera is not responsible or liable in any way for the third party software packages.
6 changes: 4 additions & 2 deletions llm-service/app/ai/indexing/summary_indexer.py
@@ -75,7 +75,7 @@
from ..vector_stores.vector_store_factory import VectorStoreFactory
from ...config import settings
from ...services.metadata_apis import data_sources_metadata_api
from ...services.models.providers import CAIIModelProvider
from ...services.models.providers import CAIIModelProvider, AzureModelProvider, OpenAiModelProvider

logger = logging.getLogger(__name__)

@@ -132,9 +132,11 @@ def __index_configuration(
embed_summaries: bool = True,
) -> Dict[str, Any]:
prompt_helper: Optional[PromptHelper] = None
# if we're using CAII, let's be conservative, and use a small context window to account for mistral's small context
# if we're using CAII, let's be conservative and use a small context window to account for mistral's small context
if CAIIModelProvider.is_enabled():
prompt_helper = PromptHelper(context_window=3000)
if AzureModelProvider.is_enabled() or OpenAiModelProvider.is_enabled():
prompt_helper = PromptHelper(context_window=min(llm.metadata.context_window, 10000))
return {
"llm": llm,
"response_synthesizer": get_response_synthesizer(
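For illustration, here is a minimal sketch of the provider-dependent context-window selection this hunk introduces, pulled out into a standalone helper. The helper name and its boolean arguments are hypothetical; in the actual code the checks are `CAIIModelProvider.is_enabled()`, `AzureModelProvider.is_enabled()`, and `OpenAiModelProvider.is_enabled()`, and the 3000/10000 limits come straight from the diff.

```python
# Illustrative sketch only: mirrors the provider-specific PromptHelper selection above.
from typing import Optional

from llama_index.core import PromptHelper
from llama_index.core.llms import LLM


def choose_prompt_helper(llm: LLM, caii: bool, azure_or_openai: bool) -> Optional[PromptHelper]:
    """Pick a conservative context window depending on the model provider."""
    prompt_helper: Optional[PromptHelper] = None
    if caii:
        # CAII may be backed by Mistral, which has a small context window.
        prompt_helper = PromptHelper(context_window=3000)
    if azure_or_openai:
        # Cap at 10k tokens, but never exceed what the model itself reports.
        prompt_helper = PromptHelper(context_window=min(llm.metadata.context_window, 10000))
    # Other providers keep the default prompt helper (None).
    return prompt_helper
```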
5 changes: 3 additions & 2 deletions llm-service/app/routers/index/sessions/__init__.py
@@ -302,8 +302,9 @@ def generate_stream() -> Generator[str, None, None]:
yield f"data: {event_json}\n\n"
first_message = False
response_id = response.additional_kwargs["response_id"]
json_delta = json.dumps({"text": response.delta})
yield f"data: {json_delta}\n\n"
if response.delta:
json_delta = json.dumps({"text": response.delta})
yield f"data: {json_delta}\n\n"

if not cancel_event.is_set() and response_id:
done = ChatEvent(type="done", name="chat_done", timestamp=time.time())
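As a hedged illustration of why this guard matters, the sketch below shows the server-sent-events framing used here: without the `if response.delta:` check, empty deltas would still be serialized and emitted as no-op `data:` frames. The generator name and inputs are hypothetical, not part of this module.

```python
# Illustrative sketch: forward only non-empty text deltas as SSE "data:" frames.
import json
from typing import Generator, Iterable


def sse_frames(deltas: Iterable[str]) -> Generator[str, None, None]:
    """Serialize streamed text deltas as server-sent events, skipping empty chunks."""
    for delta in deltas:
        if delta:  # mirrors the `if response.delta:` guard added in this hunk
            yield f"data: {json.dumps({'text': delta})}\n\n"
```

For example, `list(sse_frames(["Hel", "", "lo"]))` produces two frames and silently drops the empty chunk.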
97 changes: 97 additions & 0 deletions llm-service/app/services/query/agents/non_streamer_bedrock_converse.py
@@ -0,0 +1,97 @@
from typing import (
Sequence,
Optional,
Union,
Any,
List,
AsyncGenerator,
)

from llama_index.core.base.llms.types import (
ChatMessage,
ChatResponse,
)
from llama_index.core.tools import BaseTool
from llama_index.llms.bedrock_converse import BedrockConverse


class FakeStreamBedrockConverse(BedrockConverse):
"""
A class that inherits from BedrockConverse but overrides its astream_chat_with_tools function.
This class is used to create a non-streaming version of the BedrockConverse.
"""

def __init__(self, *args: Any, **kwargs: Any) -> None:
"""
Initialize the FakeStreamBedrockConverse class.
"""
super().__init__(*args, **kwargs)

async def astream_chat_with_tools(
self,
tools: Sequence["BaseTool"],
user_msg: Optional[Union[str, ChatMessage]] = None,
chat_history: Optional[List[ChatMessage]] = None,
verbose: bool = False,
allow_parallel_tool_calls: bool = False,
tool_required: bool = False,
**kwargs: Any,
) -> AsyncGenerator[ChatResponse, None]:
# This method is overridden to provide a non-streaming version of the chat with tools.
# Here we yield a single ChatResponse object instead of streaming multiple responses.
async def _fake_stream() -> AsyncGenerator[ChatResponse, None]:
response = await self.achat_with_tools(
tools=tools,
user_msg=user_msg,
chat_history=chat_history,
verbose=verbose,
allow_parallel_tool_calls=allow_parallel_tool_calls,
tool_required=tool_required,
**kwargs,
)
yield response

return _fake_stream()

@classmethod
def from_bedrock_converse(
cls, bedrock_converse: BedrockConverse
) -> "FakeStreamBedrockConverse":
"""
Create a FakeStreamBedrockConverse object from a BedrockConverse object.

Args:
bedrock_converse: A BedrockConverse object

Returns:
A FakeStreamBedrockConverse object with the same public attributes as the input BedrockConverse
"""
# Create a new instance of FakeStreamBedrockConverse with only the public parameters
# Let the parent class handle initialization of private attributes
return cls(
model=bedrock_converse.model,
temperature=bedrock_converse.temperature,
max_tokens=bedrock_converse.max_tokens,
additional_kwargs=bedrock_converse.additional_kwargs,
callback_manager=bedrock_converse.callback_manager,
system_prompt=bedrock_converse.system_prompt,
messages_to_prompt=bedrock_converse.messages_to_prompt,
completion_to_prompt=bedrock_converse.completion_to_prompt,
pydantic_program_mode=bedrock_converse.pydantic_program_mode,
output_parser=bedrock_converse.output_parser,
profile_name=bedrock_converse.profile_name,
aws_access_key_id=bedrock_converse.aws_access_key_id,
aws_secret_access_key=bedrock_converse.aws_secret_access_key,
aws_session_token=bedrock_converse.aws_session_token,
region_name=bedrock_converse.region_name,
api_version=bedrock_converse.api_version,
use_ssl=bedrock_converse.use_ssl,
verify=bedrock_converse.verify,
endpoint_url=bedrock_converse.endpoint_url,
timeout=bedrock_converse.timeout,
max_retries=bedrock_converse.max_retries,
guardrail_identifier=bedrock_converse.guardrail_identifier,
guardrail_version=bedrock_converse.guardrail_version,
application_inference_profile_arn=bedrock_converse.application_inference_profile_arn,
trace=bedrock_converse.trace,
)
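A hedged usage sketch for the new wrapper, assuming valid AWS credentials and a Bedrock model that supports tool calling via the Converse API; the model id below is only an example and is not taken from this PR.

```python
# Hypothetical usage: wrap a BedrockConverse so tool calling "streams" one complete response.
import asyncio

from llama_index.llms.bedrock_converse import BedrockConverse

from app.services.query.agents.non_streamer_bedrock_converse import (
    FakeStreamBedrockConverse,
)


async def main() -> None:
    llm = BedrockConverse(model="meta.llama3-1-70b-instruct-v1:0")  # example model id
    non_streaming_llm = FakeStreamBedrockConverse.from_bedrock_converse(llm)

    # astream_chat_with_tools returns an async generator that yields exactly one ChatResponse.
    stream = await non_streaming_llm.astream_chat_with_tools(
        tools=[], user_msg="What is RAG Studio?"
    )
    async for response in stream:
        print(response.message.content)


if __name__ == "__main__":
    asyncio.run(main())
```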
95 changes: 68 additions & 27 deletions llm-service/app/services/query/agents/tool_calling_querier.py
@@ -57,6 +57,8 @@
from llama_index.core.schema import NodeWithScore
from llama_index.core.tools import BaseTool
from llama_index.core.workflow import StopEvent
from llama_index.llms.bedrock_converse import BedrockConverse
from llama_index.llms.bedrock_converse.utils import get_model_name

from app.ai.indexing.summary_indexer import SummaryIndexer
from app.services.metadata_apis.session_metadata_api import Session
@@ -65,6 +67,9 @@
from app.services.query.agents.agent_tools.retriever import (
build_retriever_tool,
)
from app.services.query.agents.non_streamer_bedrock_converse import (
FakeStreamBedrockConverse,
)
from app.services.query.chat_engine import (
FlexibleContextChatEngine,
)
@@ -84,6 +89,16 @@
"mistral.mixtral-8x7b-instruct-v0:1",
}

BEDROCK_STREAMING_TOOL_MODELS = {
"anthropic.claude-3-5-sonnet-20241022-v2:0",
"anthropic.claude-3-7-sonnet-20250219-v1:0",
"anthropic.claude-sonnet-4-20250514-v1:0",
"anthropic.claude-opus-4-20250514-v1:0",
"amazon.nova-pro-v1:0",
"cohere.command-r-plus-v1:0",
"cohere.command-r-v1:0",
}


def should_use_retrieval(
data_source_ids: list[int],
@@ -101,27 +116,53 @@ def should_use_retrieval(


DEFAULT_AGENT_PROMPT = """\
Today's date is {date} and the current time is {time}. \
### DATE AND TIME
Today's date is {date} and the current time is {time}. This date and time \
is considered the current date and time for all responses. \

### ROLE DESCRIPTION
You are an expert agent that can answer questions with the help of tools. \
Go through the tools available and use them appropriately to answer the \
user's question. If you do not know the answer to a question, you \
truthfully say you do not know. As the agent, you will provide an \
answer based solely on the provided sources with citations to the \
paragraphs.

Note for in-line citations:
* Use the citations from the chat history as is.
* Use links provided by the tools if needed to answer the question and cite them in-line \
You will use the date and time provided above to refine the user's query \
and provide the best possible answer. \

### BEST PRACTICES
You will follow these best practices when answering questions:
1. Refine the user's query using the date and time provided above \
if necessary, or if the user has not provided enough information \
to answer the question.
2. Go through the tools available.
3. Approach the question step by step, using the tools \
available to you to gather information when necessary.
4. Once you have the information you need, provide a final answer \
to the user, citing the sources you used (if available).
5. If you do not know the answer to a question, or cannot find the \
information you need with the provided sources or tools, truthfully \
say you do not know, explain how you arrived at the response and what \
information (links, if any) you used, and ask for clarification or more information.

### OUTPUT FORMAT
As the agent, you will provide an answer based solely on the provided \
sources with citations (if available). Only return the answer with \
citations (if used) to the user. If you cannot answer the question with the \
provided sources or tools, you will return a message saying you \
cannot answer the question and ask the user to provide more \
information or clarify the question. \

### CITATION FORMAT
You will use the following format to cite sources in your response:
* Use the citations from the chat history as is.
* Use links provided by tool results if needed to answer the question and cite them in-line \
in the given format: the link should be in markdown format. For example: \
Refer to the example in [example.com](https://example.com). Do not make up links that are not \
present.
* Cite from node_ids in the given format: the node_id \
present.
* Cite from tool results with node_ids in the given format: the node_id \
should be in an html anchor tag (<a href>) with an html 'class' of 'rag_citation'. \
Do not use filenames as citations. Only node ids should be used. \
For example: <a class="rag_citation" href="2" ></a>. Do not make up node ids that are not present
in the context.
* All citations should be either in-line citations or markdown links.
* All citations should be either in-line citations or markdown links.

For example:

@@ -380,29 +421,29 @@ def gen() -> Generator[ChatResponse, None, None]:
def build_function_agent(
enhanced_query: str, llm: FunctionCallingLLM, tools: list[BaseTool]
) -> tuple[FunctionAgent, str]:
formatted_prompt = DEFAULT_AGENT_PROMPT.format(
date=datetime.datetime.now().strftime("%A, %B %d, %Y"),
time=datetime.datetime.now().strftime("%H:%M:%S %p"),
)
callable_tools = cast(list[BaseTool | Callable[[], Any]], tools)
if llm.metadata.model_name in NON_SYSTEM_MESSAGE_MODELS:
agent = FunctionAgent(
tools=cast(list[BaseTool | Callable[[], Any]], tools),
llm=llm,
)
agent = FunctionAgent(tools=callable_tools, llm=llm)
enhanced_query = (
"ROLE DESCRIPTION =========================================\n"
+ DEFAULT_AGENT_PROMPT.format(
date=datetime.datetime.now().strftime("%Y-%m-%d"),
time=datetime.datetime.now().strftime("%H:%M:%S"),
)
+ formatted_prompt
+ "=========================================================\n"
"USER QUERY ==============================================\n"
+ enhanced_query
)
else:
if (
isinstance(llm, BedrockConverse)
and get_model_name(llm.metadata.model_name)
not in BEDROCK_STREAMING_TOOL_MODELS
):
llm = FakeStreamBedrockConverse.from_bedrock_converse(llm)
agent = FunctionAgent(
tools=cast(list[BaseTool | Callable[[], Any]], tools),
llm=llm,
system_prompt=DEFAULT_AGENT_PROMPT.format(
date=datetime.datetime.now().strftime("%Y-%m-%d"),
time=datetime.datetime.now().strftime("%H:%M:%S"),
),
tools=callable_tools, llm=llm, system_prompt=formatted_prompt
)

return agent, enhanced_query
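In condensed form, the fallback added in this hunk can be read as the sketch below: a Bedrock model whose normalized name is outside the streaming-tool-call allowlist has its LLM swapped for the non-streaming wrapper before the agent is built. The helper name is hypothetical; the calls mirror the diff.

```python
# Illustrative sketch of the streaming-capability fallback added above.
from llama_index.core.llms.function_calling import FunctionCallingLLM
from llama_index.llms.bedrock_converse import BedrockConverse
from llama_index.llms.bedrock_converse.utils import get_model_name

from app.services.query.agents.non_streamer_bedrock_converse import (
    FakeStreamBedrockConverse,
)


def maybe_disable_streaming(
    llm: FunctionCallingLLM, streaming_tool_models: set[str]
) -> FunctionCallingLLM:
    """Swap in FakeStreamBedrockConverse when a Bedrock model cannot stream tool calls."""
    if (
        isinstance(llm, BedrockConverse)
        and get_model_name(llm.metadata.model_name) not in streaming_tool_models
    ):
        # The wrapper yields the full response as a one-element "stream".
        return FakeStreamBedrockConverse.from_bedrock_converse(llm)
    return llm
```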
28 changes: 24 additions & 4 deletions llm-service/app/services/query/querier.py
@@ -37,7 +37,10 @@
from llama_index.core.llms import LLM
from llama_index.core.llms.function_calling import FunctionCallingLLM
from llama_index.core.schema import NodeWithScore
from llama_index.llms.bedrock_converse.utils import BEDROCK_FUNCTION_CALLING_MODELS
from llama_index.llms.bedrock_converse.utils import (
BEDROCK_FUNCTION_CALLING_MODELS,
get_model_name,
)
from llama_index.llms.openai.utils import (
is_function_calling_model,
ALL_AVAILABLE_MODELS,
@@ -77,6 +80,15 @@

logger = logging.getLogger(__name__)

LLAMA_3_2_NON_FUNCTION_CALLING_MODELS = {
"meta.llama3-2-1b-instruct-v1:0",
"meta.llama3-2-3b-instruct-v1:0",
}

MODIFIED_BEDROCK_FUNCTION_CALLING_MODELS = tuple(
set(BEDROCK_FUNCTION_CALLING_MODELS) - LLAMA_3_2_NON_FUNCTION_CALLING_MODELS
)


def streaming_query(
chat_engine: Optional[FlexibleContextChatEngine],
@@ -126,12 +138,20 @@ def streaming_query(
return chat_response


# LlamaIndex's list of function-calling models appears out of date,
# so we have a modified version
def is_bedrock_function_calling_model_v2(model_name: str) -> bool:
return get_model_name(model_name) in MODIFIED_BEDROCK_FUNCTION_CALLING_MODELS


def check_for_tool_calling_support(llm: LLM) -> None:
if BedrockModelProvider.is_enabled() and not llm.metadata.is_function_calling_model:
if BedrockModelProvider.is_enabled() and not is_bedrock_function_calling_model_v2(
llm.metadata.model_name
):
raise HTTPException(
status_code=422,
detail=f"Tool calling is enabled, but the model {llm.metadata.model_name} does not support tool calling. "
f"The following models support tool calling: {', '.join(list(BEDROCK_FUNCTION_CALLING_MODELS))}.",
detail=f"Tool calling is enabled, but the model {get_model_name(llm.metadata.model_name)} does not support tool calling. "
f"The following models support tool calling: {', '.join(list(MODIFIED_BEDROCK_FUNCTION_CALLING_MODELS))}.",
)
if (
OpenAiModelProvider.is_enabled() or AzureModelProvider.is_enabled()
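A short, hedged example of how the trimmed allowlist behaves; the helper mirrors `is_bedrock_function_calling_model_v2` above, and the final check assumes `get_model_name` leaves a plain model id unchanged.

```python
# Illustrative check against the trimmed Bedrock function-calling allowlist.
from llama_index.llms.bedrock_converse.utils import (
    BEDROCK_FUNCTION_CALLING_MODELS,
    get_model_name,
)

# Llama 3.2 1B/3B are on LlamaIndex's list but are excluded by this PR.
EXCLUDED = {
    "meta.llama3-2-1b-instruct-v1:0",
    "meta.llama3-2-3b-instruct-v1:0",
}
ALLOWED = tuple(set(BEDROCK_FUNCTION_CALLING_MODELS) - EXCLUDED)


def supports_tool_calling(model_name: str) -> bool:
    """True when the normalized Bedrock model id is in the trimmed allowlist."""
    return get_model_name(model_name) in ALLOWED


# A Llama 3.2 1B session would now be rejected with a 422 by check_for_tool_calling_support.
print(supports_tool_calling("meta.llama3-2-1b-instruct-v1:0"))  # expected: False
```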
12 changes: 10 additions & 2 deletions ui/src/pages/RagChatTab/Settings/ChatSettingsModal.tsx
@@ -52,7 +52,7 @@ import {
import { useGetLlmModels, useGetRerankingModels } from "src/api/modelsApi.ts";
import { transformModelOptions } from "src/utils/modelUtils.ts";
import { ResponseChunksRange } from "pages/RagChatTab/Settings/ResponseChunksSlider.tsx";
import { useContext } from "react";
import { useContext, useEffect } from "react";
import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx";
import {
UpdateSessionRequest,
@@ -77,8 +77,8 @@ const ChatSettingsModal = ({
const { data: rerankingModels } = useGetRerankingModels();
const {
dataSourcesQuery: { dataSources },
activeSession,
} = useContext(RagChatContext);
const { activeSession } = useContext(RagChatContext);
const [form] = Form.useForm<Omit<CreateSessionType, "id">>();
const queryClient = useQueryClient();
const updateSession = useUpdateSessionMutation({
Expand All @@ -99,6 +99,14 @@ const ChatSettingsModal = ({
return null;
}

useEffect(() => {
if (activeSession.name) {
form.setFieldsValue({
name: activeSession.name,
});
}
}, [activeSession.name, form.setFieldsValue]);

const handleUpdateSession = () => {
form
.validateFields()