Merged
Changes from all commits · 21 commits
f96c664
Bump Mui X Charts Packages (#244)
ewilliams-cloudera Jun 12, 2025
ce534dd
Remove crew, replace with custom tool-calling (#243)
jkwatson Jun 12, 2025
3d78fbb
Update release version to 1.23.0-beta
actions-user Jun 12, 2025
eb87355
Fix Python logging alignment (#246)
mliu-cloudera Jun 13, 2025
a59662d
Bump brace-expansion in /ui in the npm_and_yarn group across 1 direct…
dependabot[bot] Jun 13, 2025
70c9830
Fix docker build/compose issues (#240)
jkwatson Jun 13, 2025
5ad0c04
Streaming chat cleanup & in-process docling (#247)
jkwatson Jun 18, 2025
ea23106
Update release version to 1.23.0
actions-user Jun 18, 2025
42aca02
Show loader for doc summaries and open response links in new tab (#249)
ewilliams-cloudera Jun 18, 2025
626ebcb
Save artifacts in the repository, rather than using the release artif…
jkwatson Jun 23, 2025
193a3cf
Update release version to 1.23.0-beta
actions-user Jun 23, 2025
ba1bc97
Implement fake-streaming for non-streaming tool models (#251)
jkwatson Jun 24, 2025
c0503ed
Update release version to 1.23.0-beta
actions-user Jun 24, 2025
45d705c
Enable tool calling by default for a subset of models (#252)
jkwatson Jun 24, 2025
f8cede5
Update release version to 1.23.0-beta
actions-user Jun 24, 2025
97be352
Misc. changes (#253)
jkwatson Jun 26, 2025
c7c9987
New Tools Manager (#255)
ewilliams-cloudera Jun 26, 2025
37e0ee5
Update release version to 1.23.0-beta
actions-user Jun 26, 2025
b2162f8
Update release version to 1.23.0
actions-user Jul 8, 2025
d04759d
Get docling fixes onto main (#258)
jkwatson Jul 8, 2025
405ce6e
Merge branch 'release/1' into main
jkwatson Jul 8, 2025
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
# prebuilt_artifacts/* filter=lfs diff=lfs merge=lfs -text
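The filter rule ships commented out, so `prebuilt_artifacts/*` is not yet routed through Git LFS. If the rule were enabled, the usual sequence would look like this (a sketch, not part of this PR):

```bash
# One-time per clone, then track the artifact directory
git lfs install
git lfs track "prebuilt_artifacts/*"  # writes the uncommented rule to .gitattributes
git add .gitattributes prebuilt_artifacts/
git lfs ls-files                      # verify the artifacts are stored as LFS pointers
```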
20 changes: 15 additions & 5 deletions .github/workflows/publish_release.yml
@@ -20,7 +20,6 @@ on:
- customer-hotfix
jobs:
build:

runs-on: ubuntu-latest

steps:
@@ -29,6 +28,13 @@ jobs:
with:
ref: ${{ github.event.inputs.BRANCH }}
ssh-key: ${{ secrets.DEPLOY_KEY }}
lfs: true

- name: Install Git LFS
run: |
sudo apt-get update && sudo apt-get install git-lfs
- name: Initialize Git LFS
run: git lfs install

- name: Set up JDK 21
uses: actions/setup-java@v4
@@ -102,14 +108,18 @@ jobs:
run: |
git config --local user.name actions-user
git config --local user.email "actions@github.com"
echo "export RELEASE_TAG=${{ github.event.inputs.VERSION }}" > release_version.txt
git add release_version.txt
echo "export RELEASE_TAG=${{ github.event.inputs.VERSION }}" > scripts/release_version.txt
mkdir -p prebuilt_artifacts
cp backend/build/libs/rag-api.jar prebuilt_artifacts/
cp ui/fe-dist.tar.gz prebuilt_artifacts/
cp ui/express/node-dist.tar.gz prebuilt_artifacts/
git add prebuilt_artifacts
git add scripts/release_version.txt
if ! git diff --cached --quiet; then
git commit -m "Update release version to ${{ github.event.inputs.VERSION }}"
git push
else
echo "No changes to commit"
fi
working-directory: scripts
env:
GITHUB_TOKEN: ${{ github.token }}
GITHUB_TOKEN: ${{ github.token }}
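Two details in this hunk are easy to miss: the checkout now fetches LFS objects (`lfs: true`), and the commit step is guarded by `git diff --cached --quiet`, which exits non-zero only when staged changes exist. A minimal local reproduction of the guard (paths illustrative):

```bash
git add prebuilt_artifacts scripts/release_version.txt
if ! git diff --cached --quiet; then    # staged changes exist
  git commit -m "Update release version"
  git push
else
  echo "No changes to commit"           # keeps workflow re-runs idempotent
fi
```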
41 changes: 41 additions & 0 deletions .github/workflows/publish_runtime.yml
@@ -0,0 +1,41 @@
name: publish_runtime.yml
on:
workflow_dispatch:
inputs:
VERSION:
description: 'Version of runtime to release'
required: true
BRANCH:
description: 'Branch to use for runtime build'
required: true
default: 'main'
type: choice
options:
- main
- mob/main
- release/1
- customer-hotfix
jobs:
runtime-build:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.BRANCH }}
ssh-key: ${{ secrets.DEPLOY_KEY }}
lfs: true

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Build Docker image
run: |
docker build -t ghcr.io/cloudera/rag-studio-runtime:${{ github.event.inputs.VERSION }} -t ghcr.io/cloudera/rag-studio-runtime:latest -f runtime.Dockerfile .
docker push ghcr.io/cloudera/rag-studio-runtime:${{ github.event.inputs.VERSION }}
docker push ghcr.io/cloudera/rag-studio-runtime:latest
working-directory: llm-service
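Because the job pushes both a versioned tag and `latest`, consumers can pull either. A quick smoke test of the published image might look like this (the entrypoint and any required ports are assumptions, not taken from this PR):

```bash
docker pull ghcr.io/cloudera/rag-studio-runtime:latest
docker run --rm -it ghcr.io/cloudera/rag-studio-runtime:latest  # inspect the runtime interactively
```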
2 changes: 1 addition & 1 deletion .gitignore
@@ -10,4 +10,4 @@ databases/
**/.DS_Store
.history
addresses/
tools/
tools/mcp.json
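The ignore pattern narrows from the whole `tools/` directory to just the generated `tools/mcp.json`, so other files under `tools/` become trackable again. `git check-ignore` confirms which rule applies (the second file name is illustrative):

```bash
git check-ignore -v tools/mcp.json    # reports the tools/mcp.json rule
git check-ignore -v tools/my_tool.py  # prints nothing: the file is no longer ignored
```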
12 changes: 12 additions & 0 deletions README.md
@@ -150,6 +150,18 @@ the Node service locally, you can do so by following these steps:
docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/databases/qdrant_storage:/qdrant/storage:z qdrant/qdrant
```

#### Modifying UI in CML

* This is an unsupported workflow, but it is possible to modify the UI code in CML.

- Start a CML Session from a CML Project that has the RAG Studio AMP installed.
- Open the terminal in the CML Session and navigate to the `ui` directory.
- Run `source ~/.bashrc` to ensure the Node environment variables are loaded.
- Install PNPM using `npm install -g pnpm`. Docs on PNPM can be found here: https://pnpm.io/installation#using-npm
- Run `pnpm install` to install the dependencies.
- Make your changes to the UI code in the `ui` directory.
- Run `pnpm build` to build the new UI bundle (the full sequence is sketched below).
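Taken together, the steps above amount to the following terminal session (a sketch; the exact layout depends on the AMP project):

```bash
source ~/.bashrc      # load the Node environment variables
cd ui
npm install -g pnpm   # see https://pnpm.io/installation#using-npm
pnpm install          # install dependencies
# ...make your UI changes under ui/...
pnpm build            # build the new UI bundle
```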

## The Fine Print

IMPORTANT: Please read the following before proceeding. This AMP includes or otherwise depends on certain third party software packages. Information about such third party software packages are made available in the notice file associated with this AMP. By configuring and launching this AMP, you will cause such third party software packages to be downloaded and installed into your environment, in some instances, from third parties' websites. For each third party software package, please see the notice file and the applicable websites for more information, including the applicable license terms. If you do not wish to download and install the third party software packages, do not configure, launch or otherwise use this AMP. By configuring, launching or otherwise using the AMP, you acknowledge the foregoing statement and agree that Cloudera is not responsible or liable in any way for the third party software packages.
@@ -123,11 +123,11 @@ public List<Project> getProjects(String username) {
SELECT *
FROM project
WHERE created_by_id = :createdById
OR default_project = true
OR default_project = :default
""";
handle.registerRowMapper(ConstructorMapper.factory(Project.class));
try (Query query = handle.createQuery(sql)) {
query.bind("createdById", username);
query.bind("createdById", username).bind("default", true);
return query.mapTo(Project.class).list();
}
});
@@ -145,11 +145,12 @@ public Project getDefaultProject() {
"""
SELECT *
FROM project
WHERE default_project = true
WHERE default_project = :default
""";
handle.registerRowMapper(ConstructorMapper.factory(Project.class));
try (Query query = handle.createQuery(sql)) {
return query
.bind("default", true)
.mapTo(Project.class)
.findOne()
.orElseThrow(() -> new NotFound("Default project not found"));
@@ -40,14 +40,13 @@ BEGIN;

CREATE TABLE project
(
id BIGINT auto_increment NOT NULL,
id SERIAL PRIMARY KEY,
name VARCHAR(1024) NOT NULL,
default_project BOOLEAN NOT NULL DEFAULT FALSE,
time_created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
time_updated TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
created_by_id VARCHAR(255) NOT NULL,
updated_by_id VARCHAR(255) NOT NULL,
CONSTRAINT PK_project PRIMARY KEY (id)
updated_by_id VARCHAR(255) NOT NULL
);

COMMIT;
@@ -38,6 +38,6 @@

BEGIN;

INSERT INTO project (name, created_by_id, updated_by_id) VALUES ('Default', 'admin', 'admin');
INSERT INTO project (name, DEFAULT_PROJECT, created_by_id, updated_by_id) VALUES ('Default', true,'admin', 'admin');

COMMIT;
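These migrations replace H2's MySQL-flavored DDL (`auto_increment` plus a named primary-key constraint, under `SET MODE MYSQL`) with Postgres-style `SERIAL PRIMARY KEY`, and the seed row now sets `default_project` explicitly. A quick sanity check once they have run (connection string illustrative):

```bash
psql "$DATABASE_URL" -c \
  "SELECT id, name, default_project FROM project WHERE default_project = true;"
```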
@@ -40,10 +40,9 @@ BEGIN;

CREATE TABLE project_data_source
(
id BIGINT auto_increment NOT NULL,
id SERIAL PRIMARY KEY,
project_id BIGINT NOT NULL,
data_source_id BIGINT NOT NULL,
CONSTRAINT PK_project_ds PRIMARY KEY (id)
data_source_id BIGINT NOT NULL
);

COMMIT;
@@ -36,8 +36,6 @@
* DATA.
*/

SET MODE MYSQL;

BEGIN;

DELETE from project_data_source where project_id = (select id from project where default_project = true);
@@ -36,8 +36,6 @@
* DATA.
*/

SET MODE MYSQL;

BEGIN;

INSERT INTO project_data_source (project_id, data_source_id)
1 change: 1 addition & 0 deletions docker-compose.yaml
@@ -53,6 +53,7 @@ services:
- S3_RAG_DOCUMENT_BUCKET=cloudera-ai-rag-dev-us-west-2
- QDRANT_HOST=qdrant
- API_URL=http://api:8080
- MLFLOW_RECONCILER_DATA_PATH=/tmp
depends_on:
- qdrant
db:
6 changes: 1 addition & 5 deletions docs/allow_list.txt
@@ -4,13 +4,9 @@ NVM:
https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh

Node 22:
https://nodejs.org/dist/v22.15.0/node-v22.15.0-darwin-arm64.tar.xz
https://nodejs.org/dist/v22.15.0/node-v22.15.0-linux-x64.tar.gz

RAG Studio artifacts:
# note: these first 3 redirect to the specific release url (eg. releases/download/1.16.0/...)
https://github.com/cloudera/CML_AMP_RAG_Studio/releases/latest/download/rag-api.jar
https://github.com/cloudera/CML_AMP_RAG_Studio/releases/latest/download/fe-dist.tar.gz
https://github.com/cloudera/CML_AMP_RAG_Studio/releases/latest/download/node-dist.tar.gz
https://github.com/cloudera/CML_AMP_RAG_Studio/releases/download/model_download/craft_mlt_25k.pth
https://github.com/cloudera/CML_AMP_RAG_Studio/releases/download/model_download/latin_g2.pth
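With the release-artifact URLs dropped (the jar and UI tarballs now live in the repository's `prebuilt_artifacts/` directory), only the OCR model downloads remain allow-listed. They can be fetched directly, e.g.:

```bash
curl -LO https://github.com/cloudera/CML_AMP_RAG_Studio/releases/download/model_download/craft_mlt_25k.pth
curl -LO https://github.com/cloudera/CML_AMP_RAG_Studio/releases/download/model_download/latin_g2.pth
```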

32 changes: 22 additions & 10 deletions llm-service/app/ai/indexing/base.py
@@ -1,3 +1,4 @@
import logging
import os
from abc import abstractmethod
from dataclasses import dataclass
@@ -6,13 +7,17 @@

from .readers.base_reader import BaseReader, ReaderConfig
from .readers.csv import CSVReader
from .readers.docling_reader import DoclingReader
from .readers.docx import DocxReader
from .readers.images import ImagesReader
from .readers.json import JSONReader
from .readers.markdown import MdReader
from .readers.pdf import PDFReader
from .readers.pptx import PptxReader
from .readers.simple_file import SimpleFileReader
from ...config import settings

logger = logging.getLogger(__name__)

READERS: Dict[str, Type[BaseReader]] = {
".pdf": PDFReader,
@@ -29,6 +34,11 @@
".png": ImagesReader,
}

DOCLING_READERS: Dict[str, Type[BaseReader]] = {
".pdf": DoclingReader,
".html": DoclingReader,
}


@dataclass
class NotSupportedFileExtensionError(Exception):
@@ -50,17 +60,19 @@ def index_file(self, file_path: Path, doc_id: str) -> None:

def _get_reader_class(self, file_path: Path) -> Type[BaseReader]:
file_extension = os.path.splitext(file_path)[1]
reader_cls = READERS.get(file_extension)
reader_cls: Optional[Type[BaseReader]] = None
if settings.advanced_pdf_parsing and DOCLING_READERS.get(file_extension):
try:
reader_cls = DoclingReader
except Exception as e:
logger.error(
"Error initializing DoclingReader, falling back to default readers",
e,
)
reader_cls = READERS.get(file_extension)
else:
reader_cls = READERS.get(file_extension)
if not reader_cls:
raise NotSupportedFileExtensionError(file_extension)

return reader_cls


def get_reader_class(file_path: Path) -> Type[BaseReader]:
file_extension = os.path.splitext(file_path)[1]
reader_cls = READERS.get(file_extension)
if not reader_cls:
raise NotSupportedFileExtensionError(file_extension)

return reader_cls
89 changes: 89 additions & 0 deletions llm-service/app/ai/indexing/readers/docling_reader.py
@@ -0,0 +1,89 @@
#
# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
# (C) Cloudera, Inc. 2025
# All rights reserved.
#
# Applicable Open Source License: Apache 2.0
#
# NOTE: Cloudera open source products are modular software products
# made up of hundreds of individual components, each of which was
# individually copyrighted. Each Cloudera open source product is a
# collective work under U.S. Copyright Law. Your license to use the
# collective work is as provided in your written agreement with
# Cloudera. Used apart from the collective work, this file is
# licensed for your use pursuant to the open source license
# identified above.
#
# This code is provided to you pursuant a written agreement with
# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
# this code. If you do not have a written agreement with Cloudera nor
# with an authorized and properly licensed third party, you do not
# have any rights to access nor to use this code.
#
# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
# DATA.
#

import logging
from pathlib import Path
from typing import List, Any

from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.base import BaseChunk
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from llama_index.core.schema import Document, TextNode, NodeRelationship

from .base_reader import BaseReader
from .base_reader import ChunksResult
from .pdf import MarkdownSerializerProvider

logger = logging.getLogger(__name__)

class DoclingReader(BaseReader):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)

def load_chunks(self, file_path: Path) -> ChunksResult:
document = Document()
document.id_ = self.document_id
self._add_document_metadata(document, file_path)
parent = document.as_related_node_info()

converted_chunks: List[TextNode] = []
logger.debug(f"{file_path=}")
docling_doc: ConversionResult = DocumentConverter().convert(file_path)
chunky_chunks = HybridChunker(serializer_provider=MarkdownSerializerProvider()).chunk(docling_doc.document)
chunky_chunk: BaseChunk
for i, chunky_chunk in enumerate(chunky_chunks):
page_number: int = 0
if not hasattr(chunky_chunk.meta, "doc_items"):
logger.warning(f"Chunk {i} is empty, skipping")
continue
for item in chunky_chunk.meta.doc_items:
page_number= item.prov[0].page_no if item.prov else None
node = TextNode(text=chunky_chunk.text)
if page_number:
node.metadata["page_number"] = page_number
node.metadata["file_name"] = document.metadata["file_name"]
node.metadata["document_id"] = document.metadata["document_id"]
node.metadata["data_source_id"] = document.metadata["data_source_id"]
node.metadata["chunk_number"] = i
node.metadata["chunk_format"] = "markdown"
node.relationships.update(
{NodeRelationship.SOURCE: parent}
)
converted_chunks.append(node)
return ChunksResult(converted_chunks)
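To preview what Docling extracts from a document before `HybridChunker` splits it, the `docling` package's standalone CLI can help; the `--to` and `--output` flags below reflect recent Docling releases but should be treated as assumptions:

```bash
pip install docling
docling path/to/sample.pdf --to md --output out/  # converts the PDF and writes markdown to out/
```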