feature: add target_splits_per_chunk to upload file request
skeptrunedev committed May 24, 2024
1 parent d6830f3 commit 1d04599
Showing 10 changed files with 96 additions and 82 deletions.
2 changes: 1 addition & 1 deletion search/src/components/Atoms/Tooltip.tsx
@@ -19,7 +19,7 @@ export const Tooltip = (props: TooltipProps) => {
{props.body}
</div>
<Show when={show()}>
<div class="absolute z-10 inline-block w-[100px] -translate-x-[45%] translate-y-3 rounded bg-white p-2 text-center shadow-lg dark:bg-black">
<div class="absolute z-10 inline-block w-[300px] -translate-x-[45%] translate-y-3 rounded bg-white p-2 text-center shadow-lg dark:bg-black">
{props.tooltipText}
</div>
<div class="caret absolute h-4 w-4 translate-x-[2px] translate-y-2 rotate-45 transform bg-white dark:bg-shark-700" />
47 changes: 43 additions & 4 deletions search/src/components/UploadFile.tsx
@@ -1,6 +1,9 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { Show, createSignal, useContext } from "solid-js";
import { DatasetAndUserContext } from "./Contexts/DatasetAndUserContext";
import { BiSolidFile } from "solid-icons/bi";
import { Tooltip } from "./Atoms/Tooltip";
import { BsInfoCircle } from "solid-icons/bs";

export const UploadFile = () => {
const datasetAndUserContext = useContext(DatasetAndUserContext);
@@ -14,18 +17,20 @@ export const UploadFile = () => {
const [errorText, setErrorText] = createSignal("");
const [submitted, setSubmitted] = createSignal(false);
const [timestamp, setTimestamp] = createSignal("");
const [splitDelimiters, setSplitDelimiters] = createSignal([".", "?", "\\n"]);
const [targetSplitsPerChunk, setTargetSplitsPerChunk] = createSignal(20);

const handleDragUpload = (e: DragEvent) => {
e.preventDefault();
e.stopPropagation();
setFile(e.dataTransfer?.files[0]);
setFile(e.dataTransfer?.files[0] as any);
};
const handleDirectUpload = (e: Event & { target: HTMLInputElement }) => {
e.preventDefault();
e.stopPropagation();
setFile(e.target.files ? e.target.files[0] : undefined);
setFile(e.target.files ? (e.target.files[0] as any) : undefined);
};
const submitEvidence = async (e: Event) => {
const uploadFile = async (e: Event) => {
const currentDataset = $dataset?.();
if (!currentDataset) return;

@@ -61,6 +66,8 @@ export const UploadFile = () => {
file_name: file_name,
link: link(),
tag_set: tagSet().split(","),
split_delimiters: splitDelimiters(),
target_splits_per_chunk: targetSplitsPerChunk(),
};

if (timestamp()) {
@@ -129,6 +136,38 @@ export const UploadFile = () => {
}}
/>
</div>
<div class="flex flex-col space-y-2">
<div class="flex flex-row items-center space-x-2">
<div>Split Delimiters</div>
<Tooltip
body={<BsInfoCircle />}
tooltipText="Split delimiters is an optional field which allows you to specify the delimiters to use when splitting the file before chunking the text. If not specified, the default [.!?\n] are used to split into sentences. However, you may want to use spaces or other delimiters."
/>
</div>
<input
type="text"
placeholder="optional - separate with commas"
value={splitDelimiters().join(",")}
onInput={(e) => setSplitDelimiters(e.target.value.split(","))}
class="w-full rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
/>
</div>
<div class="flex flex-col space-y-2">
<div class="flex flex-row items-center space-x-2">
<div>Target Splits Per Chunk</div>
<Tooltip
body={<BsInfoCircle />}
tooltipText="Target splits per chunk. This is an optional field which allows you to specify the number of splits you want per chunk. If not specified, the default 20 is used. However, you may want to use a different number. Trieve will evenly distribute remainder splits across chunks such that 46 splits with a target_splits_per_chunk of 20 will result in 3 chunks with 22 splits each."
/>
</div>
<input
type="number"
placeholder="optional"
value={targetSplitsPerChunk()}
onInput={(e) => setTargetSplitsPerChunk(parseInt(e.target.value))}
class="w-full rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
/>
</div>
<label
for="dropzone-file"
class="dark:hover:bg-bray-800 flex h-64 w-full cursor-pointer flex-col items-center justify-center rounded-lg border-2 border-dashed border-gray-300 bg-neutral-100 hover:bg-neutral-200 dark:border-gray-600 dark:bg-neutral-700 dark:hover:border-gray-500 dark:hover:bg-gray-600"
@@ -189,7 +228,7 @@ export const UploadFile = () => {
class="w-fit rounded bg-neutral-100 p-2 hover:bg-neutral-100 dark:bg-neutral-700 dark:hover:bg-neutral-800"
type="submit"
disabled={isSubmitting()}
onClick={(e) => void submitEvidence(e)}
onClick={(e) => void uploadFile(e)}
>
<Show when={!isSubmitting()}>Upload and Chunk New File</Show>
<Show when={isSubmitting()}>
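The frontend change above wires the two new inputs, split_delimiters and target_splits_per_chunk, into the body of the existing upload request. Below is a minimal client-side sketch of the resulting POST /api/file call; the apiHost, apiKey, and TR-Dataset header handling are illustrative assumptions rather than code taken from this commit.

// Hypothetical sketch of the upload request with the new fields (TypeScript).
// apiHost, apiKey, and datasetId are placeholders; header names are assumed.
const uploadFileSketch = async (
  apiHost: string,
  apiKey: string,
  datasetId: string,
  base64File: string, // base64url-encoded file contents
) => {
  const response = await fetch(`${apiHost}/api/file`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: apiKey,
      "TR-Dataset": datasetId, // dataset header name is an assumption
    },
    body: JSON.stringify({
      base64_file: base64File,
      file_name: "example.pdf",
      link: "https://example.com/example.pdf",
      tag_set: ["example", "upload"],
      // New in this commit:
      split_delimiters: [".", "?", "\n"],
      target_splits_per_chunk: 20,
    }),
  });
  return response.json();
};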
4 changes: 2 additions & 2 deletions server/src/bin/file-worker.rs
@@ -11,7 +11,7 @@ use trieve_server::{
data::models::{self, FileWorkerMessage},
errors::ServiceError,
establish_connection, get_env,
operators::file_operator::{create_chunks_with_handler, create_file_query, get_aws_bucket},
operators::file_operator::{create_file_chunks, create_file_query, get_aws_bucket},
};

fn main() {
@@ -327,7 +327,7 @@ async fn upload_file(
"Queue chunks for creation for file",
);

create_chunks_with_handler(
create_file_chunks(
created_file.id,
file_worker_message.upload_file_data,
html_content,
2 changes: 1 addition & 1 deletion server/src/bin/ingestion-worker.rs
@@ -688,7 +688,7 @@ async fn upload_chunk(
} else {
match payload.chunk.split_avg.unwrap_or(false) {
true => {
let chunks = coarse_doc_chunker(content.clone(), None);
let chunks = coarse_doc_chunker(content.clone(), None, 20);

let embeddings = create_embeddings(
chunks,
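The ingestion worker now passes an explicit target of 20 splits per chunk into coarse_doc_chunker on the split_avg path. The chunker itself is not part of this diff, but the remainder behavior described in the new doc comment (66 splits at a target of 20 becoming 3 chunks of 22 splits) can be sketched as below; this illustrates the documented behavior only and is not the Rust implementation.

// Illustrative TypeScript sketch of the documented split distribution,
// not the actual coarse_doc_chunker code.
const chunkSizes = (totalSplits: number, targetSplitsPerChunk: number): number[] => {
  // At least one chunk; otherwise as many whole target-sized chunks as fit.
  const numChunks = Math.max(1, Math.floor(totalSplits / targetSplitsPerChunk));
  const base = Math.floor(totalSplits / numChunks);
  const remainder = totalSplits % numChunks;
  // Spread any remainder one split at a time across the leading chunks.
  return Array.from({ length: numChunks }, (_, i) => base + (i < remainder ? 1 : 0));
};

chunkSizes(66, 20); // [22, 22, 22], matching the doc comment's example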
42 changes: 2 additions & 40 deletions server/src/data/models.rs
@@ -7,7 +7,7 @@ use crate::get_env;
use crate::operators::parse_operator::convert_html_to_text;

use super::schema::*;
use crate::handlers::file_handler::UploadFileData;
use crate::handlers::file_handler::UploadFileReqPayload;
use crate::operators::search_operator::{
get_group_metadata_filter_condition, get_group_tag_set_filter_condition,
get_metadata_filter_condition,
@@ -2302,48 +2302,10 @@ impl From<RetrievedPoint> for QdrantPayload {
pub struct FileWorkerMessage {
pub file_id: uuid::Uuid,
pub dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
pub upload_file_data: FileDataDTO,
pub upload_file_data: UploadFileReqPayload,
pub attempt_number: u8,
}

#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)]
pub struct FileDataDTO {
/// Name of the file being uploaded, including the extension.
pub file_name: String,
/// Tag set is a comma separated list of tags which will be passed down to the chunks made from the file. Tags are used to filter chunks when searching. HNSW indices are created for each tag such that there is no performance loss when filtering on them.
pub tag_set: Option<Vec<String>>,
/// Description is an optional convience field so you do not have to remember what the file contains or is about. It will be included on the group resulting from the file which will hold its chunk.
pub description: Option<String>,
/// Link to the file. This can also be any string. This can be used to filter when searching for the file's resulting chunks. The link value will not affect embedding creation.
pub link: Option<String>,
/// Time stamp should be an ISO 8601 combined date and time without timezone. Time_stamp is used for time window filtering and recency-biasing search results. Will be passed down to the file's chunks.
pub time_stamp: Option<String>,
/// Metadata is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata. Will be passed down to the file's chunks.
pub metadata: Option<serde_json::Value>,
/// Create chunks is a boolean which determines whether or not to create chunks from the file. If false, you can manually chunk the file and send the chunks to the create_chunk endpoint with the file_id to associate chunks with the file. Meant mostly for advanced users.
pub create_chunks: Option<bool>,
/// Chunk delimiters is an optional field which allows you to specify the delimiters to use when chunking the file. If not specified, the default delimiters are used.
pub chunk_delimiters: Option<Vec<String>>,
/// Group tracking id is an optional field which allows you to specify the tracking id of the group that is created from the file. Chunks created will be created with the tracking id of `group_tracking_id|<index of chunk>`
pub group_tracking_id: Option<String>,
}

impl From<UploadFileData> for FileDataDTO {
fn from(upload_file_data: UploadFileData) -> Self {
FileDataDTO {
file_name: upload_file_data.file_name,
tag_set: upload_file_data.tag_set,
description: upload_file_data.description,
link: upload_file_data.link,
time_stamp: upload_file_data.time_stamp,
metadata: upload_file_data.metadata,
create_chunks: upload_file_data.create_chunks,
chunk_delimiters: upload_file_data.chunk_delimiters,
group_tracking_id: upload_file_data.group_tracking_id,
}
}
}

#[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
#[serde(untagged)]
pub enum RangeCondition {
28 changes: 17 additions & 11 deletions server/src/handlers/file_handler.rs
@@ -1,8 +1,8 @@
use super::auth_handler::{AdminOnly, LoggedUser};
use crate::{
data::models::{
DatasetAndOrgWithSubAndPlan, File, FileAndGroupId, FileDataDTO, FileWorkerMessage, Pool,
RedisPool, ServerDatasetConfiguration,
DatasetAndOrgWithSubAndPlan, File, FileAndGroupId, FileWorkerMessage, Pool, RedisPool,
ServerDatasetConfiguration,
},
errors::ServiceError,
operators::{
@@ -57,10 +57,11 @@ pub fn validate_file_name(s: String) -> Result<String, actix_web::Error> {
"key2": "value2"
},
"create_chunks": true,
"chunk_delimiters": [",",".","\n"]
"split_delimiters": [",",".","\n"],
"target_splits_per_chunk": 20,
}))]
pub struct UploadFileData {
/// Base64 encoded file. Convert + to -, / to _, and remove the ending = if present. This is the standard base64url encoding.
pub struct UploadFileReqPayload {
/// Base64 encoded file. This is the standard base64url encoding.
pub base64_file: String,
/// Name of the file being uploaded, including the extension.
pub file_name: String,
@@ -76,8 +77,10 @@ pub struct UploadFileData {
pub metadata: Option<serde_json::Value>,
/// Create chunks is a boolean which determines whether or not to create chunks from the file. If false, you can manually chunk the file and send the chunks to the create_chunk endpoint with the file_id to associate chunks with the file. Meant mostly for advanced users.
pub create_chunks: Option<bool>,
/// Chunk delimiters is an optional field which allows you to specify the delimiters to use when chunking the file. If not specified, the default delimiters are used.
pub chunk_delimiters: Option<Vec<String>>,
/// Split delimiters is an optional field which allows you to specify the delimiters to use when splitting the file before chunking the text. If not specified, the default [.!?\n] are used to split into sentences. However, you may want to use spaces or other delimiters.
pub split_delimiters: Option<Vec<String>>,
/// Target splits per chunk. This is an optional field which allows you to specify the number of splits you want per chunk. If not specified, the default 20 is used. However, you may want to use a different number. Trieve will evenly distribute remainder splits across chunks such that 66 splits with a `target_splits_per_chunk` of 20 will result in 3 chunks with 22 splits each.
pub target_splits_per_chunk: Option<i32>,
/// Group tracking id is an optional field which allows you to specify the tracking id of the group that is created from the file. Chunks created will be created with the tracking id of `group_tracking_id|<index of chunk>`
pub group_tracking_id: Option<String>,
}
@@ -95,7 +98,7 @@ pub struct UploadFileResult {
path = "/file",
context_path = "/api",
tag = "file",
request_body(content = UploadFileData, description = "JSON request payload to upload a file", content_type = "application/json"),
request_body(content = UploadFileReqPayload, description = "JSON request payload to upload a file", content_type = "application/json"),
responses(
(status = 200, description = "Confirmation that the file is uploading", body = UploadFileResult),
(status = 400, description = "Service error relating to uploading the file", body = ErrorResponseBody),
@@ -109,7 +112,7 @@ pub struct UploadFileResult {
)]
#[tracing::instrument(skip(pool))]
pub async fn upload_file_handler(
data: web::Json<UploadFileData>,
data: web::Json<UploadFileReqPayload>,
pool: web::Data<Pool>,
user: AdminOnly,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
@@ -160,6 +163,10 @@ pub async fn upload_file_handler(
let upload_file_data = data.into_inner();

let base64_decode_span = transaction.start_child("base64_decode", "base64_decode");
let mut cleaned_base64 = upload_file_data.base64_file.replace("+", "-").replace("/", "_");
if cleaned_base64.ends_with('=') {
cleaned_base64.pop();
}
let base64_engine = engine::GeneralPurpose::new(&alphabet::URL_SAFE, general_purpose::NO_PAD);

let decoded_file_data = base64_engine
@@ -183,11 +190,10 @@

bucket_upload_span.finish();

let file_data: FileDataDTO = upload_file_data.clone().into();
let message = FileWorkerMessage {
file_id,
dataset_org_plan_sub: dataset_org_plan_sub.clone(),
upload_file_data: file_data,
upload_file_data: upload_file_data.clone(),
attempt_number: 0,
};

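Because upload_file_handler now rewrites "+" to "-", "/" to "_", and drops trailing padding before decoding with a URL-safe, no-padding engine, clients can submit base64_file as either standard base64 or base64url. A small Node.js sketch of producing a base64url value up front is shown here; it is illustrative only and not code from this repository.

// Hypothetical helper for building a base64_file value in Node.js.
// Standard base64 also works now that the handler normalizes it, but
// converting to base64url client-side keeps the payload unambiguous.
import { readFileSync } from "node:fs";

const toBase64Url = (path: string): string =>
  readFileSync(path)
    .toString("base64")
    .replace(/\+/g, "-")
    .replace(/\//g, "_")
    .replace(/=+$/, ""); // strip any trailing padding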
10 changes: 5 additions & 5 deletions server/src/handlers/message_handler.rs
@@ -862,8 +862,8 @@ pub async fn stream_response(
}

#[derive(Deserialize, Serialize, Debug, ToSchema)]
pub struct SuggestedQueriesRequest {
/// The query to base the generated suggested queries off of.
pub struct SuggestedQueriesReqPayload {
/// The query to base the generated suggested queries off of using RAG. A hybrid search for 10 chunks from your dataset using this query will be performed and the context of the chunks will be used to generate the suggested queries.
pub query: String,
}

@@ -874,13 +874,13 @@ pub struct SuggestedQueriesResponse {

/// Generate suggested queries
///
/// This endpoint will generate 3 suggested queries based off the query provided in the request body and return them as a JSON object.
/// This endpoint will generate 3 suggested queries based off a hybrid search using RAG with the query provided in the request body and return them as a JSON object.
#[utoipa::path(
post,
path = "/chunk/gen_suggestions",
context_path = "/api",
tag = "chunk",
request_body(content = SuggestedQueriesRequest, description = "JSON request payload to get alternative suggested queries", content_type = "application/json"),
request_body(content = SuggestedQueriesReqPayload, description = "JSON request payload to get alternative suggested queries", content_type = "application/json"),
responses(
(status = 200, description = "A JSON object containing a list of alternative suggested queries", body = SuggestedQueriesResponse),
(status = 400, description = "Service error relating to to updating chunk, likely due to conflicting tracking_id", body = ErrorResponseBody),
@@ -895,7 +895,7 @@ pub struct SuggestedQueriesResponse {
)]
#[tracing::instrument(skip(pool))]
pub async fn create_suggested_queries_handler(
data: web::Json<SuggestedQueriesRequest>,
data: web::Json<SuggestedQueriesReqPayload>,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
pool: web::Data<Pool>,
_required_user: LoggedUser,
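For reference, the renamed SuggestedQueriesReqPayload still carries a single query field and the endpoint path is unchanged. A hypothetical client call might look like the sketch below; the header names and the shape of the response object are assumptions, since only the request payload appears in this diff.

// Hypothetical sketch of calling the suggested-queries endpoint (TypeScript).
// The "queries" response field and the header names are assumed, not shown here.
const getSuggestedQueries = async (
  apiHost: string,
  apiKey: string,
  datasetId: string,
  query: string,
) => {
  const response = await fetch(`${apiHost}/api/chunk/gen_suggestions`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: apiKey,
      "TR-Dataset": datasetId,
    },
    body: JSON.stringify({ query }),
  });
  return (await response.json()) as { queries: string[] };
};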
6 changes: 3 additions & 3 deletions server/src/lib.rs
@@ -133,7 +133,7 @@ impl Modify for SecurityAddon {
name = "BSL",
url = "https://github.com/devflowinc/trieve/blob/main/LICENSE.txt",
),
version = "0.8.7",
version = "0.8.8",
),
servers(
(url = "https://api.trieve.ai",
@@ -222,7 +222,7 @@ impl Modify for SecurityAddon {
handlers::message_handler::CreateMessageData,
handlers::message_handler::RegenerateMessageData,
handlers::message_handler::EditMessageData,
handlers::message_handler::SuggestedQueriesRequest,
handlers::message_handler::SuggestedQueriesReqPayload,
handlers::message_handler::SuggestedQueriesResponse,
handlers::chunk_handler::ChunkData,
handlers::chunk_handler::CreateChunkData,
@@ -263,7 +263,7 @@ impl Modify for SecurityAddon {
handlers::group_handler::UpdateGroupByTrackingIDData,
handlers::group_handler::AddChunkToGroupData,
operators::group_operator::BookmarkGroupResult,
handlers::file_handler::UploadFileData,
handlers::file_handler::UploadFileReqPayload,
handlers::file_handler::UploadFileResult,
handlers::invitation_handler::InvitationData,
handlers::event_handler::GetEventsData,