Skip to content

Commit 29ec7af

Browse files
committed
perf: frequency more microoptimizations
- group all const/statics together
- refactor EMPTY_VEC OnceLock to static
- expanded safety comments
- create simple vis_whitespace bool from large args struct for use in hot loop
- inlined some vars
1 parent 775bb88 commit 29ec7af

File tree

1 file changed

+22
-16
lines changed

1 file changed

+22
-16
lines changed

src/cmd/frequency.rs

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,11 @@ const NON_UTF8_ERR: &str = "<Non-UTF8 ERROR>";
238238
const EMPTY_BYTE_VEC: Vec<u8> = Vec::new();
239239
static STATS_RECORDS: OnceLock<HashMap<String, StatsData>> = OnceLock::new();
240240
static NULL_VAL: OnceLock<Vec<u8>> = OnceLock::new();
241+
static UNIQUE_COLUMNS_VEC: OnceLock<Vec<usize>> = OnceLock::new();
242+
static COL_CARDINALITY_VEC: OnceLock<Vec<(String, u64)>> = OnceLock::new();
243+
static FREQ_ROW_COUNT: OnceLock<u64> = OnceLock::new();
244+
static EMPTY_VEC: Vec<(String, u64)> = Vec::new();
245+
static ALL_UNIQUE_TEXT: OnceLock<Vec<u8>> = OnceLock::new();
241246
// FrequencyEntry, FrequencyField and FrequencyOutput are
242247
// structs for JSON output
243248
#[derive(Serialize)]
@@ -287,11 +292,6 @@ struct ProcessedFrequency {
287292
rank: f64,
288293
}
289294

290-
static UNIQUE_COLUMNS_VEC: OnceLock<Vec<usize>> = OnceLock::new();
291-
static COL_CARDINALITY_VEC: OnceLock<Vec<(String, u64)>> = OnceLock::new();
292-
static FREQ_ROW_COUNT: OnceLock<u64> = OnceLock::new();
293-
static EMPTY_VEC: OnceLock<Vec<(String, u64)>> = OnceLock::new();
294-
295295
pub fn run(argv: &[&str]) -> CliResult<()> {
296296
let mut args: Args = util::get_args(USAGE, argv)?;
297297
let mut rconfig = args.rconfig();
@@ -317,12 +317,16 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
317317
util::mem_file_check(&path, false, args.flag_memcheck)?;
318318
}
319319

320-
// Create NULL_VAL and EMPTY_VEC once to avoid repeated to_vec allocations
321-
// safety: we're initializing the start of the program
320+
// Create NULL_VAL & ALL_UNIQUE_TEXT once at the start to avoid
321+
// repeated string & vec allocations in hot loops.
322+
// safety: we're initializing the OnceLocks at the start of the program
322323
NULL_VAL
323324
.set(args.flag_null_text.as_bytes().to_vec())
324325
.unwrap();
325-
EMPTY_VEC.set(Vec::new()).unwrap();
326+
327+
ALL_UNIQUE_TEXT
328+
.set(args.flag_all_unique_text.as_bytes().to_vec())
329+
.unwrap();
326330

327331
let (headers, tables) = if let Some(idx) = args.rconfig().indexed()?
328332
&& util::njobs(args.flag_jobs) > 1
@@ -352,12 +356,14 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
352356
let mut processed_frequencies: Vec<ProcessedFrequency> = Vec::with_capacity(head_ftables.len());
353357
#[allow(unused_assignments)]
354358
let mut value_str = String::with_capacity(100);
359+
let vis_whitespace = args.flag_vis_whitespace;
355360

356361
// safety: we know that UNIQUE_COLUMNS_VEC has been previously set
357-
// when compiling frequencies by sel_headers fn
362+
// when compiling frequencies by sel_headers fn in either sequential or parallel mode
358363
let unique_headers_vec = UNIQUE_COLUMNS_VEC.get().unwrap();
359364

360365
let mut wtr = Config::new(args.flag_output.as_ref()).writer()?;
366+
// write headers
361367
wtr.write_record(vec!["field", "value", "count", "percentage", "rank"])?;
362368

363369
for (i, (header, ftab)) in head_ftables.enumerate() {
@@ -386,7 +392,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
386392

387393
row = vec![
388394
&*header_vec,
389-
if args.flag_vis_whitespace {
395+
if vis_whitespace {
390396
value_str =
391397
util::visualize_whitespace(&String::from_utf8_lossy(&processed_freq.value));
392398
value_str.as_bytes()
@@ -428,7 +434,7 @@ impl Args {
428434
if all_unique_header {
429435
// For all-unique headers, create a single entry
430436
processed_frequencies.push(ProcessedFrequency {
431-
value: self.flag_all_unique_text.as_bytes().to_vec(),
437+
value: ALL_UNIQUE_TEXT.get().unwrap().clone(),
432438
count: row_count,
433439
percentage: 100.0,
434440
formatted_percentage: self.format_percentage(100.0, abs_dec_places),
@@ -763,9 +769,7 @@ impl Args {
763769
// optimize the capacity of the freq_tables based on the cardinality of the columns
764770
// if sequential, use the cardinality from the stats cache
765771
// if parallel, use a default capacity of 1000 for non-unique columns
766-
let col_cardinality_vec = COL_CARDINALITY_VEC
767-
.get()
768-
.unwrap_or(EMPTY_VEC.get().unwrap());
772+
let col_cardinality_vec = COL_CARDINALITY_VEC.get().unwrap_or(&EMPTY_VEC);
769773
let mut freq_tables: Vec<_> = if col_cardinality_vec.is_empty() {
770774
(0..nsel_len)
771775
.map(|_| Frequencies::with_capacity(1000))
@@ -964,7 +968,7 @@ impl Args {
964968
let unique_headers_vec = UNIQUE_COLUMNS_VEC.get().unwrap();
965969
let mut processed_frequencies = Vec::with_capacity(head_ftables.len());
966970
let abs_dec_places = self.flag_pct_dec_places.unsigned_abs() as u32;
967-
let stats_records = STATS_RECORDS.get();
971+
// pre-allocate space for 17 field stats, see list below for details
968972
let mut field_stats: Vec<FieldStats> = Vec::with_capacity(17);
969973

970974
for (i, (header, ftab)) in head_ftables.enumerate() {
@@ -1002,7 +1006,9 @@ impl Args {
10021006
};
10031007

10041008
// Get stats record for this field
1005-
let stats_record = stats_records.and_then(|records| records.get(&field_name));
1009+
let stats_record = STATS_RECORDS
1010+
.get()
1011+
.and_then(|records| records.get(&field_name));
10061012

10071013
// Get data type and nullcount from stats record
10081014
let dtype = stats_record.map_or(String::new(), |sr| sr.r#type.clone());

0 commit comments

Comments
 (0)