Skip to content

Commit

Permalink
Bug fix monotonic on arm (#333)
Browse files Browse the repository at this point in the history
This PR fixes the implementation of intrinsic_tsc on AARCH64 to match
the scale of values returned on X86_64 and switch to leverage special
registries to fetch the value instead of trying to calculate them.

Also rehaul the fallback mechanism to calculate the cpu clocks per
second to make it more reliable and precise (and faster).
  • Loading branch information
danielealbano committed Apr 13, 2023
1 parent 1a0193b commit 5b6a488
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 54 deletions.
1 change: 1 addition & 0 deletions src/clock.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>

#include "misc.h"
Expand Down
4 changes: 2 additions & 2 deletions src/clock.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ static inline __attribute__((always_inline)) int64_t clock_timespec_to_int64_ms(
static inline __attribute__((always_inline)) void clock_monotonic(
timespec_t *timespec) {
uint64_t cycles = intrinsics_tsc();
uint64_t cycles_per_second = intrinsics_cycles_per_second_get();
uint64_t cycles_per_second = intrinsics_frequency_max();
uint64_t seconds = cycles / cycles_per_second;
uint64_t remaining_cycles = cycles % cycles_per_second;

Expand All @@ -61,7 +61,7 @@ static inline __attribute__((always_inline)) void clock_monotonic(

static inline __attribute__((always_inline)) int64_t clock_monotonic_int64_ms() {
uint64_t cycles = intrinsics_tsc();
uint64_t cycles_per_second = intrinsics_cycles_per_second_get();
uint64_t cycles_per_second = intrinsics_frequency_max();

uint64_t quotient = 1000ULL / cycles_per_second;
uint64_t remainder = 1000ULL % cycles_per_second;
Expand Down
112 changes: 92 additions & 20 deletions src/intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,41 +7,113 @@
**/

#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>

#if defined(__x86_64__)
#include <cpuid.h>
#endif

#include "misc.h"

#include "intrinsics.h"

uint64_t intrinsics_cycles_per_second = 0;
uint64_t intrinsics_frequency_max_internal = 0;

FUNCTION_CTOR(intrinsics_cycles_per_second_init, {
intrinsics_cycles_per_second_calibrate();
intrinsics_frequency_max_internal = intrinsics_frequency_max();
})

void intrinsics_cycles_per_second_calibrate() {
intrinsics_cycles_per_second = intrinsics_cycles_per_second_calculate();
#if defined(__aarch64__)
uint64_t intrinsics_frequency_max_calculate_aarch64_cntfrq_el0() {
uint64_t frq;
asm volatile(
"mrs %0, cntfrq_el0"
: "=r"(frq));

// Adjust the scale for compatibility with the x86_64 implementation
return (uint32_t)frq * 100;
}
#endif

#if defined(__x86_64__)
uint64_t intrinsics_frequency_calculate_max_x64_cpuid_level_16h() {
uint32_t eax, ebx, ecx, edx;
if (__get_cpuid_max(0, NULL) < 0x16) {
return 0;
}

__cpuid_count(0x16, 0, eax, ebx, ecx, edx);

if (eax == 0) {
return 0;
}

uint64_t cycles = ebx;
cycles *= 1000;

return cycles;
}
#endif

uint64_t intrinsics_frequency_max_calculate_simple() {
uint64_t hz_per_second;
uint64_t mhz_per_second;

uint64_t intrinsics_cycles_per_second_calculate() {
uint64_t loops = 3;
uint64_t loop_wait_ms = 300;
uint64_t cycles_per_second_sum = 0;
// Calculate the amount of cycles in a second
uint64_t start = intrinsics_tsc();
sleep(1);
uint64_t end = intrinsics_tsc();

// Calculate the average cycles per second
for(int i = 0; i < loops; i++) {
uint64_t start = intrinsics_tsc();
usleep(loop_wait_ms * 1000);
cycles_per_second_sum += intrinsics_tsc() - start;
// Check for overflow
if (end < start) {
hz_per_second = ((UINT64_MAX - start) + end);
} else {
hz_per_second = (end - start);
}

// Calculate the average cycles per second
uint64_t cycles_per_second_internal =
(uint64_t)(((double)cycles_per_second_sum / (double)loops) * (1000.0 / (double)loop_wait_ms));
// Calculate the mhz for rounding
mhz_per_second = hz_per_second / 1000000;

// Modern CPUs usually have GHz frequencies with 1 digit and one decimal, for example 42007283080 Hz is actually
// 4.2 GHz (or 4200 MHz) so we need to divide by 10000000 and then multiply it back to get the correct value
cycles_per_second_internal = (uint64_t)((uint64_t)cycles_per_second_internal / 10000000) * 10000000;
// Round to the nearest 50 MHz
if ((mhz_per_second % 50) > 15) {
mhz_per_second = ((mhz_per_second / 50) * 50) + 50;
} else {
mhz_per_second = ((mhz_per_second / 50) * 50);
}

// Convert back to hz
hz_per_second = mhz_per_second * 1000000;

return hz_per_second;
}

uint64_t intrinsics_frequency_max_calculate() {
uint64_t return_value;

#if defined(__x86_64__)
return_value = intrinsics_frequency_calculate_max_x64_cpuid_level_16h();
#elif defined(__aarch64__)
return_value = intrinsics_frequency_max_calculate_aarch64_cntfrq_el0();
#else
return_value = 0;
#endif

// If there is no way to calculate the frequency leveraging specific instructions or registers fallback to a simple
// calculation
if (return_value == 0) {
return_value = intrinsics_frequency_max_calculate_simple();
}

return return_value;
}

return cycles_per_second_internal;
bool intrinsics_frequency_max_estimated() {
#if defined(__x86_64__)
return intrinsics_frequency_calculate_max_x64_cpuid_level_16h() > 0 ? false : true;
#elif defined(__aarch64__)
return false;
#else
#error "Unsupported architecture"
#endif
}
28 changes: 22 additions & 6 deletions src/intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,18 +1,32 @@
#ifndef CACHEGRAND_INTRINSICS_H
#define CACHEGRAND_INTRINSICS_H

#include "misc.h"

#ifdef __cplusplus
extern "C" {
#endif

extern uint64_t intrinsics_cycles_per_second;
extern uint64_t intrinsics_frequency_max_internal;

#if defined(__x86_64__)
uint64_t intrinsics_frequency_max_calculate_x64_cpuid_level_16h();
#elif defined(__aarch64__)
uint64_t intrinsics_frequency_max_calculate_aarch64_cntfrq_el0();
#endif

uint64_t intrinsics_frequency_max_calculate_simple();

void intrinsics_cycles_per_second_calibrate();
uint64_t intrinsics_frequency_max_calculate();

uint64_t intrinsics_cycles_per_second_calculate();
bool intrinsics_frequency_max_estimated();

static inline uint64_t intrinsics_cycles_per_second_get() {
return intrinsics_cycles_per_second;
static inline uint64_t intrinsics_frequency_max() {
if (unlikely(intrinsics_frequency_max_internal == 0)) {
intrinsics_frequency_max_internal = intrinsics_frequency_max_calculate();
}

return intrinsics_frequency_max_internal;
}

static inline uint64_t intrinsics_tsc() {
Expand All @@ -27,7 +41,9 @@ static inline uint64_t intrinsics_tsc() {
asm volatile (
"mrs %0, cntvct_el0"
: "=r"(tsc));
return (uint64_t) tsc;

// Adjust the scale for compatibility with the x86_64 implementation
return (uint64_t)tsc * 100;
#else
#error "unsupported platform"
#endif
Expand Down
10 changes: 5 additions & 5 deletions tests/unit_tests/test-clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,14 @@ TEST_CASE("clock.c", "[clock]") {
SECTION("clock_monotonic_int64_ms") {
timespec_t a;
clock_monotonic(&a);
int64_t a1 = clock_timespec_to_int64_ms(&a);
int64_t b = clock_monotonic_int64_ms();
int64_t a1 = clock_timespec_to_int64_ms(&a);

// Allow up to 1ms of difference although these 2 clock reads are executed right after so there shouldn't be any
int64_t diff = b - a1;

// Allow up to 1 ms of difference
if (diff <= 1) {
// Allow up to 1s of difference, the monotonic clock in cachegrand might be calculated differently
if (diff == 1) {
diff = 0;
}

Expand All @@ -152,8 +152,8 @@ TEST_CASE("clock.c", "[clock]") {
// Allow up to res in ms of difference
int64_t diff = b - clock_timespec_to_int64_ms(&a);

// Allow up to res_ms of difference
if (diff <= res_ms) {
// Allow up to 1s plus res_ms of difference, the monotonic clock in cachegrand might be calculated differently
if (diff <= 1 + res_ms) {
diff = 0;
}

Expand Down
41 changes: 20 additions & 21 deletions tests/unit_tests/test-intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,30 @@
#include <cstdint>
#include <unistd.h>

#include "intrinsics.h"

double test_intrinsics_read_cpu_freq_from_cpuinfo() {
std::ifstream cpuinfo("/proc/cpuinfo");
std::string line;
#if defined(__x86_64__)
#include <cpuid.h>
#endif

while (std::getline(cpuinfo, line)) {
if (line.find("cpu MHz") != std::string::npos) {
std::size_t pos = line.find(':');
double cpu_freq = std::stod(line.substr(pos + 1));
return cpu_freq * 1000000; // Convert MHz to Hz
}
}
throw std::runtime_error("Cannot find 'cpu MHz' in /proc/cpuinfo");
}
#include "intrinsics.h"

TEST_CASE("intrinsics.c", "[intrinsics]") {
SECTION("intrinsics_cycles_per_second_calculate") {
uint64_t cycles_per_second = intrinsics_cycles_per_second_calculate();
double cpu_freq_from_cpuinfo = test_intrinsics_read_cpu_freq_from_cpuinfo();
SECTION("intrinsics_frequency_max") {
uint64_t cycles_per_second = intrinsics_frequency_max_calculate();
uint64_t cpu_freq_simple = intrinsics_frequency_max_calculate_simple();

// Since there can be some variation in the frequency, we will use a tolerance
double tolerance = 0.02 * cpu_freq_from_cpuinfo; // 2% tolerance
REQUIRE(cycles_per_second == cpu_freq_simple);
}

REQUIRE(cycles_per_second >= (cpu_freq_from_cpuinfo - tolerance));
REQUIRE(cycles_per_second <= (cpu_freq_from_cpuinfo + tolerance));
SECTION("intrinsics_frequency_max_estimated") {
bool expected_result;
#if defined(__x86_64__)
expected_result = __get_cpuid_max(0, nullptr) < 16 ? false : true;
#elif defined(__aarch64__)
expected_result = false;
#else
expected_result = true;
#endif
// On the test hardware it should never be estimated, better to catch if it is
REQUIRE(intrinsics_frequency_max_estimated() == expected_result);
}
}

0 comments on commit 5b6a488

Please sign in to comment.