From a726cc3f8ff684edaee2ecb086c4e44fbdd4e313 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 24 Jun 2025 14:28:01 -0400 Subject: [PATCH 1/2] design a backward algorithm to produce the shortest string --- benchmarks/algorithms.h | 131 +++++++++++++++++++++++++++++++++++++++ benchmarks/benchmark.cpp | 8 ++- 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/benchmarks/algorithms.h b/benchmarks/algorithms.h index 3480c6f..aef27fe 100644 --- a/benchmarks/algorithms.h +++ b/benchmarks/algorithms.h @@ -63,6 +63,126 @@ struct BenchArgs { namespace BenchmarkShortest { + +/** + * We have that std::to_chars does not produce the shortest + * representation for numbers in scientific notation, so we + * optimize the string representation to be shorter. + */ +inline std::string optimize_number_string(const std::string &input) { + // Check if input contains 'E' or 'e' for scientific notation + auto e_pos = input.find_first_of("Ee"); + if (e_pos != std::string::npos) { + // Handle scientific notation + std::string mantissa = input.substr(0, e_pos); + std::string exponent = input.substr(e_pos + 1); + + // Remove leading zeros in exponent, preserving sign + bool negative = exponent[0] == '-'; + exponent.erase(0, negative ? 1 : 0); + exponent.erase(0, exponent.find_first_not_of('0')); + if (exponent.empty()) + exponent = "0"; + if (negative && exponent != "0") + exponent = "-" + exponent; + + // Reconstruct the number + return mantissa + "E" + exponent; + } + + // Handle non-scientific notation + if (input == "0" || input == "-0") + return input; + + // Determine sign + bool is_negative = input[0] == '-'; + std::string num = is_negative ? input.substr(1) : input; + + // Find first and last significant digits + std::string digits = num; + size_t decimal_pos = digits.find('.'); + if (decimal_pos != std::string::npos) { + digits.erase(decimal_pos, 1); // Remove decimal point + } + size_t first_non_zero = digits.find_first_not_of('0'); + size_t last_non_zero = digits.find_last_not_of('0'); + digits = digits.substr(first_non_zero, last_non_zero - first_non_zero + 1); + // Count significant digits + size_t num_digits = digits.length(); + if (num_digits == 0) + return input; + // Calculate exponent + size_t input_decimal_pos = input.find('.'); + size_t input_first_non_zero = input.find_first_not_of('0'); + size_t input_last_non_zero = input.find_last_not_of('0'); + + int exponent = 0; + if (input_decimal_pos == std::string::npos) { + // we have 123232900000 + exponent = (input_last_non_zero - input_first_non_zero); + } else if (input_last_non_zero < input_decimal_pos) { + // Number like 123.456 or 0.456 + exponent = (input_decimal_pos - input_first_non_zero - 1); + } else { + // Number like 0.000123 + exponent = + -static_cast(input.find_first_not_of('0', input_decimal_pos + 1) - + input_decimal_pos); + } + // Calculate scientific notation length + size_t mantissa_len = + num_digits + (num_digits > 1 ? 1 : 0); // Digits + optional decimal + size_t exponent_len = (exponent == 0) ? 1 + : (exponent < 0 ? 1 : 0) + + (std::abs(exponent) < 10 ? 1 + : std::abs(exponent) < 100 ? 2 + : 3); + size_t sci_len = mantissa_len + 1 + exponent_len + + (is_negative ? 1 : 0); // Mantissa + E + exponent + sign + + // Compare lengths + if (sci_len >= input.length()) + return input; + + // Construct scientific notation + std::string result; + if (is_negative) + result += "-"; + result += digits[0]; + if (num_digits > 1) { + result += "."; + result += digits.substr(1); + } + result += "e"; + result += std::to_string(exponent); + + return result; +} + +/** + * This is a special version of std::to_chars that produces the shortest + * representation for numbers. It should not be used for benchmarking. + */ +template +int std_to_chars_shorter(T d, std::span& buffer) { +#if TO_CHARS_SUPPORTED + const auto [p, ec] + = std::to_chars(buffer.data(), buffer.data() + buffer.size(), d); + if (ec != std::errc()) { + std::cerr << "problem with " << d << std::endl; + std::abort(); + } + // This is ridiculous, optimize: + std::string result(buffer.data(), p - buffer.data()); + result = optimize_number_string(result); + std::memcpy(buffer.data(), result.data(), result.size()); + return result.size(); +#else + std::cerr << "std::to_chars not supported" << std::endl; + std::abort(); +#endif +} + template int dragon4(T d, std::span& buffer) { if constexpr (std::is_same_v) @@ -432,6 +552,7 @@ int std_to_chars(T d, std::span& buffer) { #endif } + } // namespace BenchmarksShortest template @@ -441,6 +562,16 @@ auto wrap(int (*fn)(T, std::span&)) { }; } +// Experimental: shorter representation for std::to_chars +// This is not a benchmark, but a utility function to produce the shortest +// representation of a floating-point number using std::to_chars. +// It is not used in the benchmarks, but can be useful for other purposes. +// It is not optimized for performance, but for producing the shortest string. +template +BenchArgs get_std_to_chars_shorter() { + return BenchArgs("std_to_chars_short", wrap(BenchmarkShortest::std_to_chars_shorter), TO_CHARS_SUPPORTED); +} + template std::vector> initArgs(bool use_errol = false, size_t repeat = 0, size_t fixed_size = 0) { std::vector> args; diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp index 9319852..8362727 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -154,8 +154,14 @@ bool is_exact_integer(float_type x) { // New template version of describe template void describe(const std::variant>, std::vector>> &numbers, - const std::vector> &args, + std::vector> args, const std::vector &algo_filter) { + if constexpr (std::is_same_v) { + args.push_back(get_std_to_chars_shorter()); + } else if constexpr (std::is_same_v) { + args.push_back(get_std_to_chars_shorter()); + } + std::visit([&args, &algo_filter](const auto &lines) { size_t integers64 = 0; size_t integers32 = 0; From 0490d8b44cab857b9cd42de1d3f24122bf442edf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ja=C3=ABl=20Champagne=20Gareau?= Date: Fri, 27 Jun 2025 00:47:37 -0400 Subject: [PATCH 2/2] fix missing case in shortest string generation "1.23e+004" now becomes "1.23e4" instead of staying the same --- benchmarks/algorithms.h | 67 ++++++++++++++++++++-------------------- benchmarks/benchmark.cpp | 37 +++++++++++----------- 2 files changed, 53 insertions(+), 51 deletions(-) diff --git a/benchmarks/algorithms.h b/benchmarks/algorithms.h index aef27fe..cb66f24 100644 --- a/benchmarks/algorithms.h +++ b/benchmarks/algorithms.h @@ -44,7 +44,6 @@ #define YY_DOUBLE_SUPPORTED 0 #endif - template struct BenchArgs { using Type = T; @@ -63,7 +62,6 @@ struct BenchArgs { namespace BenchmarkShortest { - /** * We have that std::to_chars does not produce the shortest * representation for numbers in scientific notation, so we @@ -71,15 +69,16 @@ namespace BenchmarkShortest { */ inline std::string optimize_number_string(const std::string &input) { // Check if input contains 'E' or 'e' for scientific notation - auto e_pos = input.find_first_of("Ee"); - if (e_pos != std::string::npos) { + if (const auto e_pos = input.find_first_of("Ee"); + e_pos != std::string::npos) { // Handle scientific notation - std::string mantissa = input.substr(0, e_pos); + const std::string mantissa = input.substr(0, e_pos); std::string exponent = input.substr(e_pos + 1); // Remove leading zeros in exponent, preserving sign - bool negative = exponent[0] == '-'; - exponent.erase(0, negative ? 1 : 0); + const bool negative = exponent[0] == '-'; + const bool positive = exponent[0] == '+'; + exponent.erase(0, (negative || positive) ? 1 : 0); exponent.erase(0, exponent.find_first_not_of('0')); if (exponent.empty()) exponent = "0"; @@ -87,7 +86,7 @@ inline std::string optimize_number_string(const std::string &input) { exponent = "-" + exponent; // Reconstruct the number - return mantissa + "E" + exponent; + return mantissa + "e" + exponent; } // Handle non-scientific notation @@ -95,28 +94,29 @@ inline std::string optimize_number_string(const std::string &input) { return input; // Determine sign - bool is_negative = input[0] == '-'; - std::string num = is_negative ? input.substr(1) : input; + const bool is_negative = input[0] == '-'; // Find first and last significant digits - std::string digits = num; - size_t decimal_pos = digits.find('.'); - if (decimal_pos != std::string::npos) { - digits.erase(decimal_pos, 1); // Remove decimal point + std::string digits = is_negative ? input.substr(1) : input; + if (const size_t decimal_pos = digits.find('.'); + decimal_pos != std::string::npos) { + digits.erase(decimal_pos, 1); // Remove decimal point } - size_t first_non_zero = digits.find_first_not_of('0'); - size_t last_non_zero = digits.find_last_not_of('0'); + const size_t first_non_zero = digits.find_first_not_of('0'); + const size_t last_non_zero = digits.find_last_not_of('0'); digits = digits.substr(first_non_zero, last_non_zero - first_non_zero + 1); + // Count significant digits - size_t num_digits = digits.length(); + const size_t num_digits = digits.length(); if (num_digits == 0) return input; + // Calculate exponent - size_t input_decimal_pos = input.find('.'); - size_t input_first_non_zero = input.find_first_not_of('0'); - size_t input_last_non_zero = input.find_last_not_of('0'); + const size_t input_decimal_pos = input.find('.'); + const size_t input_first_non_zero = input.find_first_not_of('0'); + const size_t input_last_non_zero = input.find_last_not_of('0'); - int exponent = 0; + int exponent; if (input_decimal_pos == std::string::npos) { // we have 123232900000 exponent = (input_last_non_zero - input_first_non_zero); @@ -126,19 +126,21 @@ inline std::string optimize_number_string(const std::string &input) { } else { // Number like 0.000123 exponent = - -static_cast(input.find_first_not_of('0', input_decimal_pos + 1) - - input_decimal_pos); + -static_cast(input.find_first_not_of('0', input_decimal_pos + 1) + - input_decimal_pos); } // Calculate scientific notation length - size_t mantissa_len = - num_digits + (num_digits > 1 ? 1 : 0); // Digits + optional decimal - size_t exponent_len = (exponent == 0) ? 1 - : (exponent < 0 ? 1 : 0) + - (std::abs(exponent) < 10 ? 1 - : std::abs(exponent) < 100 ? 2 - : 3); - size_t sci_len = mantissa_len + 1 + exponent_len + - (is_negative ? 1 : 0); // Mantissa + E + exponent + sign + const size_t mantissa_len = + num_digits + (num_digits > 1 ? 1 : 0); // Digits + optional decimal + const size_t exponent_len = (exponent == 0) + ? 1 + : (exponent < 0 ? 1 : 0) + + (std::abs(exponent) < 10 ? 1 + : std::abs(exponent) < 100 ? 2 + : 3); + const size_t sci_len = + mantissa_len + 1 + exponent_len + + (is_negative ? 1 : 0); // Mantissa + E + exponent + sign // Compare lengths if (sci_len >= input.length()) @@ -552,7 +554,6 @@ int std_to_chars(T d, std::span& buffer) { #endif } - } // namespace BenchmarksShortest template diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp index 8362727..5a40765 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -37,11 +37,11 @@ void evaluateProperties(const std::vector> &lines, } struct diy_float_t { - diy_float_t(uint64_t significand, int exponent, bool is_negative) - : significand(significand), exponent(exponent), is_negative(is_negative) {} - uint64_t significand; - int exponent; - bool is_negative; + diy_float_t(uint64_t significand, int exponent, bool is_negative) + : significand(significand), exponent(exponent), is_negative(is_negative) {} + uint64_t significand; + int exponent; + bool is_negative; }; template @@ -144,23 +144,24 @@ std::vector> get_random_numbers(size_t howmany, // Checks if a floating-point number is exactly representable as the specified integer type template bool is_exact_integer(float_type x) { - if (!std::isfinite(x)) { - return false; - } - int_type i = static_cast(x); - return static_cast(i) == x; + if (!std::isfinite(x)) { + return false; + } + int_type i = static_cast(x); + return static_cast(i) == x; } // New template version of describe template -void describe(const std::variant>, std::vector>> &numbers, - std::vector> args, - const std::vector &algo_filter) { - if constexpr (std::is_same_v) { - args.push_back(get_std_to_chars_shorter()); - } else if constexpr (std::is_same_v) { - args.push_back(get_std_to_chars_shorter()); - } +void describe(const std::variant>, + std::vector>> &numbers, + std::vector> args, + const std::vector &algo_filter) { + if constexpr (std::is_same_v) { + args.push_back(get_std_to_chars_shorter()); + } else if constexpr (std::is_same_v) { + args.push_back(get_std_to_chars_shorter()); + } std::visit([&args, &algo_filter](const auto &lines) { size_t integers64 = 0;