Skip to content

Commit c4ce324

Browse files
committed
perf: searchset microoptimize for perf
- shared regex_labels across threads - preallocated match_list capacity - removed unnecessary `matched` var - removed `.contains` and refactored to iterate matches directly - optimized string building for the `--flag` option - early exit in `--quick` mode
1 parent 5f50f23 commit c4ce324

File tree

1 file changed

+42
-19
lines changed

1 file changed

+42
-19
lines changed

src/cmd/searchset.rs

Lines changed: 42 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,10 @@ Common options:
9696
use std::{
9797
fs::{self, File},
9898
io::{self, BufRead, BufReader},
99-
sync::Arc,
99+
sync::{
100+
Arc,
101+
atomic::{AtomicBool, Ordering},
102+
},
100103
};
101104

102105
use crossbeam_channel;
@@ -428,21 +431,27 @@ impl Args {
428431
true
429432
});
430433

431-
// Wrap pattern in Arc for sharing across threads
434+
// Wrap pattern and regex_labels in Arc for sharing across threads
432435
let pattern = Arc::new(pattern);
436+
let regex_labels = Arc::new(regex_labels.to_vec());
433437
let invert_match = self.flag_invert_match;
438+
let flag_quick = self.flag_quick;
439+
440+
// Atomic flag for early termination in quick mode
441+
let match_found = Arc::new(AtomicBool::new(false));
434442

435443
// Create thread pool and channel
436444
let pool = ThreadPool::new(njobs);
437445
let (send, recv) = crossbeam_channel::bounded(nchunks);
438446

439447
// Spawn search jobs
440448
for i in 0..nchunks {
441-
let (send, args, sel, pattern) = (
449+
let (send, args, sel, pattern, match_found_flag) = (
442450
send.clone(),
443451
self.clone(),
444452
sel.clone(),
445453
Arc::clone(&pattern),
454+
Arc::clone(&match_found),
446455
);
447456
pool.execute(move || {
448457
// safety: we know the file is indexed and seekable
@@ -454,22 +463,21 @@ impl Args {
454463
let mut row_number = (i * chunk_size) as u64 + 1; // 1-based row numbering
455464

456465
for record in it.flatten() {
457-
let mut matched = false;
458-
let mut match_list = Vec::new();
466+
// Early exit for quick mode if match already found by another thread
467+
if flag_quick && match_found_flag.load(Ordering::Relaxed) {
468+
break;
469+
}
470+
471+
let mut match_list = Vec::with_capacity(pattern.len());
459472

460473
// Check if any field matches
461474
let row_matched = sel.select(&record).any(|f| {
462475
let is_match = pattern.is_match(f);
463476
if is_match && do_match_list {
464-
let matches: Vec<usize> = pattern.matches(f).into_iter().collect();
465-
for m in matches {
466-
let adjusted = m + 1; // 1-based for human readability
467-
if !match_list.contains(&adjusted) {
468-
match_list.push(adjusted);
469-
}
477+
for m in pattern.matches(f) {
478+
match_list.push(m + 1); // 1-based for human readability
470479
}
471480
}
472-
matched = matched || is_match;
473481
is_match
474482
});
475483

@@ -479,13 +487,23 @@ impl Args {
479487
row_matched
480488
};
481489

490+
// Set flag if we found a match in quick mode
491+
if flag_quick && final_matched {
492+
match_found_flag.store(true, Ordering::Relaxed);
493+
}
494+
482495
results.push(SearchSetResult {
483496
row_number,
484497
record,
485498
matched: final_matched,
486499
match_list,
487500
});
488501
row_number += 1;
502+
503+
// Early exit after finding first match in quick mode
504+
if flag_quick && final_matched {
505+
break;
506+
}
489507
}
490508
send.send(results).unwrap();
491509
});
@@ -499,7 +517,7 @@ impl Args {
499517
}
500518

501519
// Sort by row_number to maintain original order
502-
all_results.par_sort_by_key(|r| r.row_number);
520+
all_results.par_sort_unstable_by_key(|r| r.row_number);
503521

504522
// Handle --quick mode: find earliest match
505523
if self.flag_quick {
@@ -551,12 +569,17 @@ impl Args {
551569
matched_rows.as_bytes().to_vec()
552570
} else {
553571
total_matches += match_list.len() as u64;
554-
let match_list_str = match_list
555-
.iter()
556-
.map(|i| regex_labels[*i - 1].clone())
557-
.collect::<Vec<String>>()
558-
.join(",");
559-
match_list_with_row = format!("{matched_rows};{match_list_str}");
572+
// builds format!("{matched_rows};{match_list}")
573+
// without intermediate Vec allocation
574+
match_list_with_row.clear();
575+
match_list_with_row.push_str(&matched_rows);
576+
match_list_with_row.push(';');
577+
for (idx, i) in match_list.iter().enumerate() {
578+
if idx > 0 {
579+
match_list_with_row.push(',');
580+
}
581+
match_list_with_row.push_str(&regex_labels[*i - 1]);
582+
}
560583
match_list_with_row.as_bytes().to_vec()
561584
}
562585
} else {

0 commit comments

Comments
 (0)