feature: add target_splits_per_chunk to upload file request
skeptrunedev committed May 24, 2024
1 parent d6830f3 commit 1d04599
Showing 10 changed files with 96 additions and 82 deletions.
2 changes: 1 addition & 1 deletion search/src/components/Atoms/Tooltip.tsx
@@ -19,7 +19,7 @@ export const Tooltip = (props: TooltipProps) => {
{props.body}
</div>
<Show when={show()}>
<div class="absolute z-10 inline-block w-[100px] -translate-x-[45%] translate-y-3 rounded bg-white p-2 text-center shadow-lg dark:bg-black">
<div class="absolute z-10 inline-block w-[300px] -translate-x-[45%] translate-y-3 rounded bg-white p-2 text-center shadow-lg dark:bg-black">
{props.tooltipText}
</div>
<div class="caret absolute h-4 w-4 translate-x-[2px] translate-y-2 rotate-45 transform bg-white dark:bg-shark-700" />
47 changes: 43 additions & 4 deletions search/src/components/UploadFile.tsx
@@ -1,6 +1,9 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { Show, createSignal, useContext } from "solid-js";
import { DatasetAndUserContext } from "./Contexts/DatasetAndUserContext";
import { BiSolidFile } from "solid-icons/bi";
import { Tooltip } from "./Atoms/Tooltip";
import { BsInfoCircle } from "solid-icons/bs";

export const UploadFile = () => {
const datasetAndUserContext = useContext(DatasetAndUserContext);
@@ -14,18 +17,20 @@ export const UploadFile = () => {
const [errorText, setErrorText] = createSignal("");
const [submitted, setSubmitted] = createSignal(false);
const [timestamp, setTimestamp] = createSignal("");
const [splitDelimiters, setSplitDelimiters] = createSignal([".", "?", "\\n"]);
const [targetSplitsPerChunk, setTargetSplitsPerChunk] = createSignal(20);

const handleDragUpload = (e: DragEvent) => {
e.preventDefault();
e.stopPropagation();
setFile(e.dataTransfer?.files[0]);
setFile(e.dataTransfer?.files[0] as any);
};
const handleDirectUpload = (e: Event & { target: HTMLInputElement }) => {
e.preventDefault();
e.stopPropagation();
setFile(e.target.files ? e.target.files[0] : undefined);
setFile(e.target.files ? (e.target.files[0] as any) : undefined);
};
const submitEvidence = async (e: Event) => {
const uploadFile = async (e: Event) => {
const currentDataset = $dataset?.();
if (!currentDataset) return;

@@ -61,6 +66,8 @@ export const UploadFile = () => {
file_name: file_name,
link: link(),
tag_set: tagSet().split(","),
split_delimiters: splitDelimiters(),
target_splits_per_chunk: targetSplitsPerChunk(),
};

if (timestamp()) {
@@ -129,6 +136,38 @@ export const UploadFile = () => {
}}
/>
</div>
<div class="flex flex-col space-y-2">
<div class="flex flex-row items-center space-x-2">
<div>Split Delimiters</div>
<Tooltip
body={<BsInfoCircle />}
tooltipText="Split delimiters is an optional field which allows you to specify the delimiters to use when splitting the file before chunking the text. If not specified, the default [.!?\n] are used to split into sentences. However, you may want to use spaces or other delimiters."
/>
</div>
<input
type="text"
placeholder="optional - separate with commas"
value={splitDelimiters().join(",")}
onInput={(e) => setSplitDelimiters(e.target.value.split(","))}
class="w-full rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
/>
</div>
<div class="flex flex-col space-y-2">
<div class="flex flex-row items-center space-x-2">
<div>Target Splits Per Chunk</div>
<Tooltip
body={<BsInfoCircle />}
tooltipText="Target splits per chunk. This is an optional field which allows you to specify the number of splits you want per chunk. If not specified, the default 20 is used. However, you may want to use a different number. Trieve will evenly distribute remainder splits across chunks such that 46 splits with a target_splits_per_chunk of 20 will result in 3 chunks with 22 splits each."
/>
</div>
<input
type="number"
placeholder="optional"
value={targetSplitsPerChunk()}
onInput={(e) => setTargetSplitsPerChunk(parseInt(e.target.value))}
class="w-full rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
/>
</div>
<label
for="dropzone-file"
class="dark:hover:bg-bray-800 flex h-64 w-full cursor-pointer flex-col items-center justify-center rounded-lg border-2 border-dashed border-gray-300 bg-neutral-100 hover:bg-neutral-200 dark:border-gray-600 dark:bg-neutral-700 dark:hover:border-gray-500 dark:hover:bg-gray-600"
@@ -189,7 +228,7 @@ export const UploadFile = () => {
class="w-fit rounded bg-neutral-100 p-2 hover:bg-neutral-100 dark:bg-neutral-700 dark:hover:bg-neutral-800"
type="submit"
disabled={isSubmitting()}
onClick={(e) => void submitEvidence(e)}
onClick={(e) => void uploadFile(e)}
>
<Show when={!isSubmitting()}>Upload and Chunk New File</Show>
<Show when={isSubmitting()}>
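The frontend change above wires the two new inputs, split_delimiters and target_splits_per_chunk, into the body of the existing upload request. Below is a minimal client-side sketch of the resulting POST /api/file call; the apiHost, apiKey, and TR-Dataset header handling are illustrative assumptions rather than code taken from this commit.

// Hypothetical sketch of the upload request with the new fields (TypeScript).
// apiHost, apiKey, and datasetId are placeholders; header names are assumed.
const uploadFileSketch = async (
  apiHost: string,
  apiKey: string,
  datasetId: string,
  base64File: string, // base64url-encoded file contents
) => {
  const response = await fetch(`${apiHost}/api/file`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: apiKey,
      "TR-Dataset": datasetId, // dataset header name is an assumption
    },
    body: JSON.stringify({
      base64_file: base64File,
      file_name: "example.pdf",
      link: "https://example.com/example.pdf",
      tag_set: ["example", "upload"],
      // New in this commit:
      split_delimiters: [".", "?", "\n"],
      target_splits_per_chunk: 20,
    }),
  });
  return response.json();
};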
4 changes: 2 additions & 2 deletions server/src/bin/file-worker.rs
@@ -11,7 +11,7 @@ use trieve_server::{
data::models::{self, FileWorkerMessage},
errors::ServiceError,
establish_connection, get_env,
operators::file_operator::{create_chunks_with_handler, create_file_query, get_aws_bucket},
operators::file_operator::{create_file_chunks, create_file_query, get_aws_bucket},
};

fn main() {
@@ -327,7 +327,7 @@ async fn upload_file(
"Queue chunks for creation for file",
);

create_chunks_with_handler(
create_file_chunks(
created_file.id,
file_worker_message.upload_file_data,
html_content,
2 changes: 1 addition & 1 deletion server/src/bin/ingestion-worker.rs
@@ -688,7 +688,7 @@ async fn upload_chunk(
} else {
match payload.chunk.split_avg.unwrap_or(false) {
true => {
let chunks = coarse_doc_chunker(content.clone(), None);
let chunks = coarse_doc_chunker(content.clone(), None, 20);

let embeddings = create_embeddings(
chunks,
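The ingestion worker now passes an explicit target of 20 splits per chunk into coarse_doc_chunker on the split_avg path. The chunker itself is not part of this diff, but the remainder behavior described in the new doc comment (66 splits at a target of 20 becoming 3 chunks of 22 splits) can be sketched as below; this illustrates the documented behavior only and is not the Rust implementation.

// Illustrative TypeScript sketch of the documented split distribution,
// not the actual coarse_doc_chunker code.
const chunkSizes = (totalSplits: number, targetSplitsPerChunk: number): number[] => {
  // At least one chunk; otherwise as many whole target-sized chunks as fit.
  const numChunks = Math.max(1, Math.floor(totalSplits / targetSplitsPerChunk));
  const base = Math.floor(totalSplits / numChunks);
  const remainder = totalSplits % numChunks;
  // Spread any remainder one split at a time across the leading chunks.
  return Array.from({ length: numChunks }, (_, i) => base + (i < remainder ? 1 : 0));
};

chunkSizes(66, 20); // [22, 22, 22], matching the doc comment's example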
42 changes: 2 additions & 40 deletions server/src/data/models.rs
@@ -7,7 +7,7 @@ use crate::get_env;
use crate::operators::parse_operator::convert_html_to_text;

use super::schema::*;
use crate::handlers::file_handler::UploadFileData;
use crate::handlers::file_handler::UploadFileReqPayload;
use crate::operators::search_operator::{
get_group_metadata_filter_condition, get_group_tag_set_filter_condition,
get_metadata_filter_condition,
@@ -2302,48 +2302,10 @@ impl From<RetrievedPoint> for QdrantPayload {
pub struct FileWorkerMessage {
pub file_id: uuid::Uuid,
pub dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
pub upload_file_data: FileDataDTO,
pub upload_file_data: UploadFileReqPayload,
pub attempt_number: u8,
}

#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)]
pub struct FileDataDTO {
/// Name of the file being uploaded, including the extension.
pub file_name: String,
/// Tag set is a comma separated list of tags which will be passed down to the chunks made from the file. Tags are used to filter chunks when searching. HNSW indices are created for each tag such that there is no performance loss when filtering on them.
pub tag_set: Option<Vec<String>>,
/// Description is an optional convience field so you do not have to remember what the file contains or is about. It will be included on the group resulting from the file which will hold its chunk.
pub description: Option<String>,
/// Link to the file. This can also be any string. This can be used to filter when searching for the file's resulting chunks. The link value will not affect embedding creation.
pub link: Option<String>,
/// Time stamp should be an ISO 8601 combined date and time without timezone. Time_stamp is used for time window filtering and recency-biasing search results. Will be passed down to the file's chunks.
pub time_stamp: Option<String>,
/// Metadata is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata. Will be passed down to the file's chunks.
pub metadata: Option<serde_json::Value>,
/// Create chunks is a boolean which determines whether or not to create chunks from the file. If false, you can manually chunk the file and send the chunks to the create_chunk endpoint with the file_id to associate chunks with the file. Meant mostly for advanced users.
pub create_chunks: Option<bool>,
/// Chunk delimiters is an optional field which allows you to specify the delimiters to use when chunking the file. If not specified, the default delimiters are used.
pub chunk_delimiters: Option<Vec<String>>,
/// Group tracking id is an optional field which allows you to specify the tracking id of the group that is created from the file. Chunks created will be created with the tracking id of `group_tracking_id|<index of chunk>`
pub group_tracking_id: Option<String>,
}

impl From<UploadFileData> for FileDataDTO {
fn from(upload_file_data: UploadFileData) -> Self {
FileDataDTO {
file_name: upload_file_data.file_name,
tag_set: upload_file_data.tag_set,
description: upload_file_data.description,
link: upload_file_data.link,
time_stamp: upload_file_data.time_stamp,
metadata: upload_file_data.metadata,
create_chunks: upload_file_data.create_chunks,
chunk_delimiters: upload_file_data.chunk_delimiters,
group_tracking_id: upload_file_data.group_tracking_id,
}
}
}

#[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
#[serde(untagged)]
pub enum RangeCondition {
28 changes: 17 additions & 11 deletions server/src/handlers/file_handler.rs
@@ -1,8 +1,8 @@
use super::auth_handler::{AdminOnly, LoggedUser};
use crate::{
data::models::{
DatasetAndOrgWithSubAndPlan, File, FileAndGroupId, FileDataDTO, FileWorkerMessage, Pool,
RedisPool, ServerDatasetConfiguration,
DatasetAndOrgWithSubAndPlan, File, FileAndGroupId, FileWorkerMessage, Pool, RedisPool,
ServerDatasetConfiguration,
},
errors::ServiceError,
operators::{
@@ -57,10 +57,11 @@ pub fn validate_file_name(s: String) -> Result<String, actix_web::Error> {
"key2": "value2"
},
"create_chunks": true,
"chunk_delimiters": [",",".","\n"]
"split_delimiters": [",",".","\n"],
"target_splits_per_chunk": 20,
}))]
pub struct UploadFileData {
/// Base64 encoded file. Convert + to -, / to _, and remove the ending = if present. This is the standard base64url encoding.
pub struct UploadFileReqPayload {
/// Base64 encoded file. This is the standard base64url encoding.
pub base64_file: String,
/// Name of the file being uploaded, including the extension.
pub file_name: String,
@@ -76,8 +77,10 @@ pub struct UploadFileData {
pub metadata: Option<serde_json::Value>,
/// Create chunks is a boolean which determines whether or not to create chunks from the file. If false, you can manually chunk the file and send the chunks to the create_chunk endpoint with the file_id to associate chunks with the file. Meant mostly for advanced users.
pub create_chunks: Option<bool>,
/// Chunk delimiters is an optional field which allows you to specify the delimiters to use when chunking the file. If not specified, the default delimiters are used.
pub chunk_delimiters: Option<Vec<String>>,
/// Split delimiters is an optional field which allows you to specify the delimiters to use when splitting the file before chunking the text. If not specified, the default [.!?\n] are used to split into sentences. However, you may want to use spaces or other delimiters.
pub split_delimiters: Option<Vec<String>>,
/// Target splits per chunk. This is an optional field which allows you to specify the number of splits you want per chunk. If not specified, the default 20 is used. However, you may want to use a different number. Trieve will evenly distribute remainder splits across chunks such that 66 splits with a `target_splits_per_chunk` of 20 will result in 3 chunks with 22 splits each.
pub target_splits_per_chunk: Option<i32>,
/// Group tracking id is an optional field which allows you to specify the tracking id of the group that is created from the file. Chunks created will be created with the tracking id of `group_tracking_id|<index of chunk>`
pub group_tracking_id: Option<String>,
}
@@ -95,7 +98,7 @@ pub struct UploadFileResult {
path = "/file",
context_path = "/api",
tag = "file",
request_body(content = UploadFileData, description = "JSON request payload to upload a file", content_type = "application/json"),
request_body(content = UploadFileReqPayload, description = "JSON request payload to upload a file", content_type = "application/json"),
responses(
(status = 200, description = "Confirmation that the file is uploading", body = UploadFileResult),
(status = 400, description = "Service error relating to uploading the file", body = ErrorResponseBody),
@@ -109,7 +112,7 @@ pub struct UploadFileResult {
)]
#[tracing::instrument(skip(pool))]
pub async fn upload_file_handler(
data: web::Json<UploadFileData>,
data: web::Json<UploadFileReqPayload>,
pool: web::Data<Pool>,
user: AdminOnly,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
@@ -160,6 +163,10 @@ pub async fn upload_file_handler(
let upload_file_data = data.into_inner();

let base64_decode_span = transaction.start_child("base64_decode", "base64_decode");
let mut cleaned_base64 = upload_file_data.base64_file.replace("+", "-").replace("/", "_");
if cleaned_base64.ends_with('=') {
cleaned_base64.pop();
}
let base64_engine = engine::GeneralPurpose::new(&alphabet::URL_SAFE, general_purpose::NO_PAD);

let decoded_file_data = base64_engine
@@ -183,11 +190,10 @@

bucket_upload_span.finish();

let file_data: FileDataDTO = upload_file_data.clone().into();
let message = FileWorkerMessage {
file_id,
dataset_org_plan_sub: dataset_org_plan_sub.clone(),
upload_file_data: file_data,
upload_file_data: upload_file_data.clone(),
attempt_number: 0,
};

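Because upload_file_handler now rewrites "+" to "-", "/" to "_", and drops trailing padding before decoding with a URL-safe, no-padding engine, clients can submit base64_file as either standard base64 or base64url. A small Node.js sketch of producing a base64url value up front is shown here; it is illustrative only and not code from this repository.

// Hypothetical helper for building a base64_file value in Node.js.
// Standard base64 also works now that the handler normalizes it, but
// converting to base64url client-side keeps the payload unambiguous.
import { readFileSync } from "node:fs";

const toBase64Url = (path: string): string =>
  readFileSync(path)
    .toString("base64")
    .replace(/\+/g, "-")
    .replace(/\//g, "_")
    .replace(/=+$/, ""); // strip any trailing padding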
10 changes: 5 additions & 5 deletions server/src/handlers/message_handler.rs
@@ -862,8 +862,8 @@ pub async fn stream_response(
}

#[derive(Deserialize, Serialize, Debug, ToSchema)]
pub struct SuggestedQueriesRequest {
/// The query to base the generated suggested queries off of.
pub struct SuggestedQueriesReqPayload {
/// The query to base the generated suggested queries off of using RAG. A hybrid search for 10 chunks from your dataset using this query will be performed and the context of the chunks will be used to generate the suggested queries.
pub query: String,
}

@@ -874,13 +874,13 @@ pub struct SuggestedQueriesResponse {

/// Generate suggested queries
///
/// This endpoint will generate 3 suggested queries based off the query provided in the request body and return them as a JSON object.
/// This endpoint will generate 3 suggested queries based off a hybrid search using RAG with the query provided in the request body and return them as a JSON object.
#[utoipa::path(
post,
path = "/chunk/gen_suggestions",
context_path = "/api",
tag = "chunk",
request_body(content = SuggestedQueriesRequest, description = "JSON request payload to get alternative suggested queries", content_type = "application/json"),
request_body(content = SuggestedQueriesReqPayload, description = "JSON request payload to get alternative suggested queries", content_type = "application/json"),
responses(
(status = 200, description = "A JSON object containing a list of alternative suggested queries", body = SuggestedQueriesResponse),
(status = 400, description = "Service error relating to to updating chunk, likely due to conflicting tracking_id", body = ErrorResponseBody),
@@ -895,7 +895,7 @@ pub struct SuggestedQueriesResponse {
)]
#[tracing::instrument(skip(pool))]
pub async fn create_suggested_queries_handler(
data: web::Json<SuggestedQueriesRequest>,
data: web::Json<SuggestedQueriesReqPayload>,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
pool: web::Data<Pool>,
_required_user: LoggedUser,
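For reference, the renamed SuggestedQueriesReqPayload still carries a single query field and the endpoint path is unchanged. A hypothetical client call might look like the sketch below; the header names and the shape of the response object are assumptions, since only the request payload appears in this diff.

// Hypothetical sketch of calling the suggested-queries endpoint (TypeScript).
// The "queries" response field and the header names are assumed, not shown here.
const getSuggestedQueries = async (
  apiHost: string,
  apiKey: string,
  datasetId: string,
  query: string,
) => {
  const response = await fetch(`${apiHost}/api/chunk/gen_suggestions`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: apiKey,
      "TR-Dataset": datasetId,
    },
    body: JSON.stringify({ query }),
  });
  return (await response.json()) as { queries: string[] };
};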
6 changes: 3 additions & 3 deletions server/src/lib.rs
@@ -133,7 +133,7 @@ impl Modify for SecurityAddon {
name = "BSL",
url = "https://github.com/devflowinc/trieve/blob/main/LICENSE.txt",
),
version = "0.8.7",
version = "0.8.8",
),
servers(
(url = "https://api.trieve.ai",
@@ -222,7 +222,7 @@ impl Modify for SecurityAddon {
handlers::message_handler::CreateMessageData,
handlers::message_handler::RegenerateMessageData,
handlers::message_handler::EditMessageData,
handlers::message_handler::SuggestedQueriesRequest,
handlers::message_handler::SuggestedQueriesReqPayload,
handlers::message_handler::SuggestedQueriesResponse,
handlers::chunk_handler::ChunkData,
handlers::chunk_handler::CreateChunkData,
@@ -263,7 +263,7 @@ impl Modify for SecurityAddon {
handlers::group_handler::UpdateGroupByTrackingIDData,
handlers::group_handler::AddChunkToGroupData,
operators::group_operator::BookmarkGroupResult,
handlers::file_handler::UploadFileData,
handlers::file_handler::UploadFileReqPayload,
handlers::file_handler::UploadFileResult,
handlers::invitation_handler::InvitationData,
handlers::event_handler::GetEventsData,