Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 86 additions & 61 deletions src/ops/functions/split_recursively.rs
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ impl SimpleFunctionFactoryBase for Factory {

#[cfg(test)]
mod tests {
use super::*;
use super::*;

// Helper function to assert chunk text and its consistency with the range within the original text.
fn assert_chunk_text_consistency(
Expand All @@ -629,31 +629,43 @@ mod tests {
// Extract text using the chunk's range from the original full text.
let extracted_text = actual_chunk.0.extract_str(full_text);
// Assert that the expected text matches the text provided in the chunk.
assert_eq!(actual_chunk.1, expected_text, "Provided chunk text mismatch - {}", context);
assert_eq!(
actual_chunk.1, expected_text,
"Provided chunk text mismatch - {}",
context
);
// Assert that the expected text also matches the text extracted using the chunk's range.
assert_eq!(extracted_text, expected_text, "Range inconsistency: extracted text mismatch - {}", context);
assert_eq!(
extracted_text, expected_text,
"Range inconsistency: extracted text mismatch - {}",
context
);
}

// Creates a default RecursiveChunker for testing, assuming no language-specific parsing.
fn create_test_chunker(text: &str, chunk_size: usize, chunk_overlap: usize) -> RecursiveChunker {
fn create_test_chunker(
text: &str,
chunk_size: usize,
chunk_overlap: usize,
) -> RecursiveChunker {
RecursiveChunker {
full_text: text,
lang_config: None,
lang_config: None,
chunk_size,
chunk_overlap,
}
}

#[test]
fn test_translate_bytes_to_chars_simple() {
let text = "abc😄def";
let mut start1 = 0;
let mut end1 = 3;
let mut start2 = 3;
let mut end2 = 7;
let mut start3 = 7;
let mut end3 = 10;
let mut end_full = text.len();
let text = "abc😄def";
let mut start1 = 0;
let mut end1 = 3;
let mut start2 = 3;
let mut end2 = 7;
let mut start3 = 7;
let mut end3 = 10;
let mut end_full = text.len();

let offsets = vec![
&mut start1,
Expand All @@ -667,22 +679,24 @@ mod tests {

translate_bytes_to_chars(text, offsets.into_iter());

assert_eq!(start1, 0);
assert_eq!(end1, 3);
assert_eq!(start2, 3);
assert_eq!(end2, 4);
assert_eq!(start3, 4);
assert_eq!(end3, 7);
assert_eq!(end_full, 7);
assert_eq!(start1, 0);
assert_eq!(end1, 3);
assert_eq!(start2, 3);
assert_eq!(end2, 4);
assert_eq!(start3, 4);
assert_eq!(end3, 7);
assert_eq!(end_full, 7);
}

#[test]
fn test_basic_split_no_overlap() {
let text = "Linea 1.\nLinea 2.\n\nLinea 3.";
let chunker = create_test_chunker(text, 15, 0);

let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk { next_regexp_sep_id: 0 });

let chunker = create_test_chunker(text, 15, 0);

let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
next_regexp_sep_id: 0,
});

assert!(result.is_ok());
let chunks = result.unwrap();

Expand All @@ -693,74 +707,85 @@ mod tests {

// Test splitting when chunk_size forces breaks within segments.
let text2 = "A very very long text that needs to be split.";
let chunker2 = create_test_chunker(text2, 20, 0);
let result2 = chunker2.split_root_chunk(ChunkKind::RegexpSepChunk { next_regexp_sep_id: 0 });

let chunker2 = create_test_chunker(text2, 20, 0);
let result2 = chunker2.split_root_chunk(ChunkKind::RegexpSepChunk {
next_regexp_sep_id: 0,
});

assert!(result2.is_ok());
let chunks2 = result2.unwrap();

// Expect multiple chunks, likely split by spaces due to chunk_size.
assert!(chunks2.len() > 1);
assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
assert!(chunks2[0].1.len() <= 20);
assert!(chunks2.len() > 1);
assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
assert!(chunks2[0].1.len() <= 20);
}
/// Splits one long sentence with `chunk_size = 20` and `chunk_overlap = 5`, then checks that
/// more than one chunk is produced and that the first chunk's text stays within 25.
#[test]
fn test_basic_split_with_overlap() {
    let text = "This is a test text that is a bit longer to see how the overlap works.";
    let chunker = create_test_chunker(text, 20, 5);

    // `expect` reports the underlying error on failure instead of a bare `is_ok()` assertion.
    let chunks = chunker
        .split_root_chunk(ChunkKind::RegexpSepChunk {
            next_regexp_sep_id: 0,
        })
        .expect("split_root_chunk should succeed for plain text");

    assert!(chunks.len() > 1);

    // NOTE(review): 25 presumably is chunk_size (20) + chunk_overlap (5) -- confirm that this is
    // the intended upper bound for a chunk produced with overlap.
    assert!(chunks[0].1.len() <= 25);
}
/// Splits text that carries whitespace around its sentences, using `chunk_size = 30` and no
/// overlap, and checks that exactly three chunks come back with the expected text.
///
/// Every chunk goes through `assert_chunk_text_consistency`, so both the chunk's own text and
/// the text extracted from `text` via the chunk's range are compared with the expected value.
#[test]
fn test_split_trims_whitespace() {
    let text = " \n First chunk. \n\n Second chunk with spaces at the end. \n";
    let chunker = create_test_chunker(text, 30, 0);

    // `expect` reports the underlying error on failure instead of a bare `is_ok()` assertion.
    let chunks = chunker
        .split_root_chunk(ChunkKind::RegexpSepChunk {
            next_regexp_sep_id: 0,
        })
        .expect("split_root_chunk should succeed for plain text");

    assert_eq!(chunks.len(), 3);

    assert_chunk_text_consistency(
        text,
        &chunks[0],
        " \n First chunk.",
        "Whitespace Test, Chunk 0",
    );
    assert_chunk_text_consistency(
        text,
        &chunks[1],
        " Second chunk with spaces at",
        "Whitespace Test, Chunk 1",
    );
    assert_chunk_text_consistency(text, &chunks[2], "the end.", "Whitespace Test, Chunk 2");
}
/// Splits text whose sentences are separated by whitespace-only and dash-only segments, using
/// `chunk_size = 10` and no overlap, and checks that exactly the three "Chunk N." pieces remain.
#[test]
fn test_split_discards_empty_chunks() {
    let text = "Chunk 1.\n\n \n\nChunk 2.\n\n------\n\nChunk 3.";
    let chunker = create_test_chunker(text, 10, 0);

    // `expect` reports the underlying error on failure instead of a bare `is_ok()` assertion.
    let chunks = chunker
        .split_root_chunk(ChunkKind::RegexpSepChunk {
            next_regexp_sep_id: 0,
        })
        .expect("split_root_chunk should succeed for plain text");

    assert_eq!(chunks.len(), 3);

    // Expect only the chunks with actual alphanumeric content.
    assert_chunk_text_consistency(text, &chunks[0], "Chunk 1.", "Discard Test, Chunk 0");
    assert_chunk_text_consistency(text, &chunks[1], "Chunk 2.", "Discard Test, Chunk 1");
    assert_chunk_text_consistency(text, &chunks[2], "Chunk 3.", "Discard Test, Chunk 2");
}
}
Loading