From e3b50875f76014c796ef8c1769f3f55937a99397 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Fri, 24 Apr 2026 00:26:43 +0800 Subject: [PATCH 01/11] DAOS-18882 pmdk: rebuild heap_curr_allocated from on-media state rebuild heap_curr_allocated from on-media state if the counter has underflowed. Signed-off-by: Liang Zhen --- src/libpmemobj/heap.c | 86 +++++++++++++++++++++++++++++++++++++++++++ src/libpmemobj/heap.h | 3 ++ src/libpmemobj/obj.c | 9 +++++ 3 files changed, 98 insertions(+) diff --git a/src/libpmemobj/heap.c b/src/libpmemobj/heap.c index 64cc19f42d..7b6699bf72 100644 --- a/src/libpmemobj/heap.c +++ b/src/libpmemobj/heap.c @@ -812,6 +812,92 @@ heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, } } +/* + * heap_zone_count_used -- (internal) compute live byte count for one zone + * from the on-media chunk headers and run bitmaps. + */ +static uint64_t +heap_zone_count_used(struct palloc_heap *heap, uint32_t zone_id) +{ + struct zone *z = ZID_TO_ZONE(heap->layout, zone_id); + uint64_t used = 0; + + if (z->header.magic != ZONE_HEADER_MAGIC) + return 0; + + for (uint32_t i = 0; i < z->header.size_idx; ) { + struct chunk_header *hdr = &z->chunk_headers[i]; + ASSERT(hdr->size_idx != 0); + + struct memory_block m = MEMORY_BLOCK_NONE; + m.zone_id = zone_id; + m.chunk_id = i; + m.size_idx = hdr->size_idx; + memblock_rebuild_state(heap, &m); + + switch (hdr->type) { + case CHUNK_TYPE_USED: + used += m.m_ops->get_real_size(&m); + break; + case CHUNK_TYPE_RUN: { + struct chunk_run *run = heap_get_chunk_run(heap, &m); + struct alloc_class *c = alloc_class_by_run( + heap->rt->alloc_classes, + run->hdr.block_size, hdr->flags, m.size_idx); + if (c != NULL) { + struct recycler_element e = + recycler_element_new(heap, &m); + used += (uint64_t)(c->rdsc.nallocs - + e.free_space) * run->hdr.block_size; + } + break; + } + case CHUNK_TYPE_FREE: + break; + default: + ASSERT(0); + } + + i = m.chunk_id + m.size_idx; + } + + return used; +} + +/* + * heap_curr_allocated_repair_if_needed -- rebuild heap_curr_allocated from + * on-media state if the counter has underflowed. + * + * This function is called once during pmemobj_open(): for healthy pools the + * counter is non-negative and this function returns after a single atomic + * load, so overhead in the common case is effectively zero. + */ +void +heap_curr_allocated_repair_if_needed(struct palloc_heap *heap) +{ + uint64_t curr; + util_atomic_load_explicit64( + &heap->stats->persistent->heap_curr_allocated, + &curr, memory_order_acquire); + + if ((int64_t)curr >= 0) + return; + + CORE_LOG_WARNING("heap_curr_allocated underflowed (0x%" PRIx64 + "); rebuilding from on-media state", curr); + + uint64_t total = 0; + for (uint32_t z = 0; z < heap->rt->nzones; ++z) + total += heap_zone_count_used(heap, z); + + util_atomic_store_explicit64( + &heap->stats->persistent->heap_curr_allocated, + total, memory_order_release); + pmemops_persist(&heap->p_ops, + &heap->stats->persistent->heap_curr_allocated, + sizeof(heap->stats->persistent->heap_curr_allocated)); +} + /* * heap_ensure_zone_reclaimed -- make sure that the specified zone has been * already reclaimed. diff --git a/src/libpmemobj/heap.h b/src/libpmemobj/heap.h index 09278bad69..ecbce86757 100644 --- a/src/libpmemobj/heap.h +++ b/src/libpmemobj/heap.h @@ -74,6 +74,9 @@ heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m); void heap_ensure_zone_reclaimed(struct palloc_heap *heap, uint32_t zone_id); +void +heap_curr_allocated_repair_if_needed(struct palloc_heap *heap); + int heap_free_chunk_reuse(struct palloc_heap *heap, struct bucket *bucket, struct memory_block *m); diff --git a/src/libpmemobj/obj.c b/src/libpmemobj/obj.c index b5f4f924f4..e1e8e9dfe4 100644 --- a/src/libpmemobj/obj.c +++ b/src/libpmemobj/obj.c @@ -950,6 +950,15 @@ obj_runtime_init(PMEMobjpool *pop, int rdonly, int boot, unsigned nlanes) if ((errno = obj_runtime_init_common(pop)) != 0) goto err_boot; + /* + * heap_curr_allocated is maintained by non-ULOG INC/SUB ops + * that are only flushed at pmemobj_close(). After an + * ungraceful shutdown the counter can be left underflowed. + * Detect and rebuild it here; healthy pools pay only a + * single atomic load. + */ + heap_curr_allocated_repair_if_needed(&pop->heap); + #if VG_MEMCHECK_ENABLED if (On_memcheck) { /* mark unused part of the pool as not accessible */ From 483b054059318823fb818e9666158204d3392932 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Thu, 23 Apr 2026 19:39:27 +0000 Subject: [PATCH 02/11] obj: recalculate curr_allocated on underflow Signed-off-by: Jan Michalski --- ChangeLog | 1 + src/libpmemobj/heap.c | 88 ++++++++++++++++++++++++------------------- src/libpmemobj/heap.h | 2 +- src/libpmemobj/obj.c | 19 +++++----- 4 files changed, 62 insertions(+), 48 deletions(-) diff --git a/ChangeLog b/ChangeLog index 095d7750ac..cc3766218b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -3,6 +3,7 @@ XXX * Version X.X.X - Fix "The pool was not closed" message (no ADR failure) (daos-stack/pmdk#1, DAOS-18692) + - Recalculate curr_allocated on underflow WA (daos-stack/pmdk#37, DAOS-18882) Fri Jan 16 2026 Oksana Sałyk diff --git a/src/libpmemobj/heap.c b/src/libpmemobj/heap.c index 7b6699bf72..2de42ee285 100644 --- a/src/libpmemobj/heap.c +++ b/src/libpmemobj/heap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "bucket.h" #include "libpmemobj/ctl.h" @@ -813,11 +814,14 @@ heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, } /* - * heap_zone_count_used -- (internal) compute live byte count for one zone - * from the on-media chunk headers and run bitmaps. + * heap_zone_get_allocated -- (internal) sums up the real size of all allocated + * (non-free) memory blocks in the zone. + * + * Note: It is not meant to calculate the sum of all allocations. + * It follows the algorithm used to calculate the heap_curr_allocated statistic. */ static uint64_t -heap_zone_count_used(struct palloc_heap *heap, uint32_t zone_id) +heap_zone_get_allocated(struct palloc_heap *heap, uint32_t zone_id) { struct zone *z = ZID_TO_ZONE(heap->layout, zone_id); uint64_t used = 0; @@ -837,21 +841,9 @@ heap_zone_count_used(struct palloc_heap *heap, uint32_t zone_id) switch (hdr->type) { case CHUNK_TYPE_USED: + case CHUNK_TYPE_RUN: used += m.m_ops->get_real_size(&m); break; - case CHUNK_TYPE_RUN: { - struct chunk_run *run = heap_get_chunk_run(heap, &m); - struct alloc_class *c = alloc_class_by_run( - heap->rt->alloc_classes, - run->hdr.block_size, hdr->flags, m.size_idx); - if (c != NULL) { - struct recycler_element e = - recycler_element_new(heap, &m); - used += (uint64_t)(c->rdsc.nallocs - - e.free_space) * run->hdr.block_size; - } - break; - } case CHUNK_TYPE_FREE: break; default: @@ -864,38 +856,58 @@ heap_zone_count_used(struct palloc_heap *heap, uint32_t zone_id) return used; } +#define CURR_ALLOCATED_UNDERFLOW_FMT \ + "heap_curr_allocated underflowed: %" PRIu64 " > heap.size: %" PRIu64 \ + "; recalculating" + /* - * heap_curr_allocated_repair_if_needed -- rebuild heap_curr_allocated from - * on-media state if the counter has underflowed. + * heap_curr_allocated_fix -- recalculate heap_curr_allocated from scratch if it + * is bigger than heap size + * + * heap_curr_allocated although is stored persistently it is not updated + * transactionally nor reliably persisted. This means e.g. that in case + * a transaction succeeds but the process will get terminated before the update + * of heap_curr_allocated, the value of heap_curr_allocated will get out of sync + * with the actual heap state. + * + * The most obvious case of this happening is when heap_curr_allocated is + * actually smaller than the sum of the sizes of all allocations in the heap, + * so in case all of the allocations are freed, heap_curr_allocated will + * underflow and get a very big value, bigger than heap size. * - * This function is called once during pmemobj_open(): for healthy pools the - * counter is non-negative and this function returns after a single atomic - * load, so overhead in the common case is effectively zero. + * Ref: https://daosio.atlassian.net/browse/DAOS-18882 + * + * This workaround detects this most obvious case and recalculates. + * It is intended to be used during open only. */ void -heap_curr_allocated_repair_if_needed(struct palloc_heap *heap) +heap_curr_allocated_wa(struct palloc_heap *heap) { - uint64_t curr; - util_atomic_load_explicit64( - &heap->stats->persistent->heap_curr_allocated, - &curr, memory_order_acquire); + uint64_t *allocatedp = &heap->stats->persistent->heap_curr_allocated; + uint64_t value; + + /* load the value and check if it is not bigger than heap size */ + util_atomic_load_explicit64(allocatedp, &value, memory_order_acquire); - if ((int64_t)curr >= 0) + if (value <= *heap->sizep) { + /* + * It doesn't mean the value is incorrect, just that it is + * reasonable. + */ return; + } - CORE_LOG_WARNING("heap_curr_allocated underflowed (0x%" PRIx64 - "); rebuilding from on-media state", curr); + CORE_LOG_WARNING(CURR_ALLOCATED_UNDERFLOW_FMT, value, *heap->sizep); - uint64_t total = 0; - for (uint32_t z = 0; z < heap->rt->nzones; ++z) - total += heap_zone_count_used(heap, z); + /* calculate the correct value */ + value = 0; + for (uint32_t z = 0; z < heap->rt->nzones; ++z) { + value += heap_zone_get_allocated(heap, z); + } - util_atomic_store_explicit64( - &heap->stats->persistent->heap_curr_allocated, - total, memory_order_release); - pmemops_persist(&heap->p_ops, - &heap->stats->persistent->heap_curr_allocated, - sizeof(heap->stats->persistent->heap_curr_allocated)); + /* store and persist the corrected value */ + util_atomic_store_explicit64(allocatedp, value, memory_order_release); + pmemops_persist(&heap->p_ops, allocatedp, sizeof(*allocatedp)); } /* diff --git a/src/libpmemobj/heap.h b/src/libpmemobj/heap.h index ecbce86757..a66887ea4d 100644 --- a/src/libpmemobj/heap.h +++ b/src/libpmemobj/heap.h @@ -75,7 +75,7 @@ void heap_ensure_zone_reclaimed(struct palloc_heap *heap, uint32_t zone_id); void -heap_curr_allocated_repair_if_needed(struct palloc_heap *heap); +heap_curr_allocated_wa(struct palloc_heap *heap); int heap_free_chunk_reuse(struct palloc_heap *heap, diff --git a/src/libpmemobj/obj.c b/src/libpmemobj/obj.c index e1e8e9dfe4..6a4e7434bf 100644 --- a/src/libpmemobj/obj.c +++ b/src/libpmemobj/obj.c @@ -21,6 +21,7 @@ #include "ravl.h" #include "heap_layout.h" +#include "heap.h" #include "os.h" #include "os_thread.h" #include "pmemops.h" @@ -950,15 +951,6 @@ obj_runtime_init(PMEMobjpool *pop, int rdonly, int boot, unsigned nlanes) if ((errno = obj_runtime_init_common(pop)) != 0) goto err_boot; - /* - * heap_curr_allocated is maintained by non-ULOG INC/SUB ops - * that are only flushed at pmemobj_close(). After an - * ungraceful shutdown the counter can be left underflowed. - * Detect and rebuild it here; healthy pools pay only a - * single atomic load. - */ - heap_curr_allocated_repair_if_needed(&pop->heap); - #if VG_MEMCHECK_ENABLED if (On_memcheck) { /* mark unused part of the pool as not accessible */ @@ -979,6 +971,15 @@ obj_runtime_init(PMEMobjpool *pop, int rdonly, int boot, unsigned nlanes) ERR_W_ERRNO("critnib_insert to pools_tree"); goto err_tree_insert; } + + /* + * heap_curr_allocated is maintained by non-ULOG INC/SUB ops + * that are only flushed at pmemobj_close(). After an + * ungraceful shutdown the counter can be left underflowed. + * Detect and rebuild it here; healthy pools pay only a + * single atomic load. + */ + heap_curr_allocated_wa(&pop->heap); } if (obj_ctl_init_and_load(pop) != 0) { From 2a712edd3d236a903be16031441a11a8accd7d0c Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Thu, 23 Apr 2026 20:37:08 +0000 Subject: [PATCH 03/11] obj: fix copyrights Signed-off-by: Jan Michalski --- src/libpmemobj/heap.c | 1 + src/libpmemobj/heap.h | 1 + src/libpmemobj/obj.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/libpmemobj/heap.c b/src/libpmemobj/heap.c index 2de42ee285..138d55be4e 100644 --- a/src/libpmemobj/heap.c +++ b/src/libpmemobj/heap.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause /* Copyright 2015-2024, Intel Corporation */ +/* Copyright 2026, Hewlett Packard Enterprise Development LP */ /* * heap.c -- heap implementation diff --git a/src/libpmemobj/heap.h b/src/libpmemobj/heap.h index a66887ea4d..de8c157990 100644 --- a/src/libpmemobj/heap.h +++ b/src/libpmemobj/heap.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /* Copyright 2015-2024, Intel Corporation */ +/* Copyright 2026, Hewlett Packard Enterprise Development LP */ /* * heap.h -- internal definitions for heap diff --git a/src/libpmemobj/obj.c b/src/libpmemobj/obj.c index 6a4e7434bf..8d9aded07c 100644 --- a/src/libpmemobj/obj.c +++ b/src/libpmemobj/obj.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause /* Copyright 2014-2024, Intel Corporation */ -/* Copyright 2025, Hewlett Packard Enterprise Development LP */ +/* Copyright 2025-2026, Hewlett Packard Enterprise Development LP */ /* * obj.c -- transactional object store implementation From 62dc4936476cbed568a2fd5c22f7b61d5ac9cd58 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Thu, 23 Apr 2026 20:47:07 +0000 Subject: [PATCH 04/11] obj: remove redundant comment Signed-off-by: Jan Michalski --- src/libpmemobj/obj.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/libpmemobj/obj.c b/src/libpmemobj/obj.c index 8d9aded07c..8275d459fc 100644 --- a/src/libpmemobj/obj.c +++ b/src/libpmemobj/obj.c @@ -972,13 +972,6 @@ obj_runtime_init(PMEMobjpool *pop, int rdonly, int boot, unsigned nlanes) goto err_tree_insert; } - /* - * heap_curr_allocated is maintained by non-ULOG INC/SUB ops - * that are only flushed at pmemobj_close(). After an - * ungraceful shutdown the counter can be left underflowed. - * Detect and rebuild it here; healthy pools pay only a - * single atomic load. - */ heap_curr_allocated_wa(&pop->heap); } From 6d3246fe9d3959a2bb55eb39df354097d35a7c07 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Thu, 23 Apr 2026 22:13:01 +0000 Subject: [PATCH 05/11] obj: add unit test Signed-off-by: Jan Michalski --- src/test/Makefile | 2 + .../.gitignore | 1 + .../obj_ctl_stats_curr_allocated_wa/Makefile | 15 +++ .../obj_ctl_stats_curr_allocated_wa/TEST0 | 27 +++++ .../obj_ctl_stats_curr_allocated_wa.c | 113 ++++++++++++++++++ .../out0.log.match | 11 ++ 6 files changed, 169 insertions(+) create mode 100644 src/test/obj_ctl_stats_curr_allocated_wa/.gitignore create mode 100644 src/test/obj_ctl_stats_curr_allocated_wa/Makefile create mode 100755 src/test/obj_ctl_stats_curr_allocated_wa/TEST0 create mode 100644 src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c create mode 100644 src/test/obj_ctl_stats_curr_allocated_wa/out0.log.match diff --git a/src/test/Makefile b/src/test/Makefile index c223ccae67..dab26479f1 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright 2014-2024, Intel Corporation +# Copyright 2026, Hewlett Packard Enterprise Development LP # # @@ -39,6 +40,7 @@ OBJ_TESTS = \ obj_ctl_debug\ obj_ctl_heap_size\ obj_ctl_stats\ + obj_ctl_stats_curr_allocated_wa\ obj_debug\ obj_defrag\ obj_defrag_advanced\ diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/.gitignore b/src/test/obj_ctl_stats_curr_allocated_wa/.gitignore new file mode 100644 index 0000000000..0de75f5abd --- /dev/null +++ b/src/test/obj_ctl_stats_curr_allocated_wa/.gitignore @@ -0,0 +1 @@ +obj_ctl_stats_curr_allocated_wa diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/Makefile b/src/test/obj_ctl_stats_curr_allocated_wa/Makefile new file mode 100644 index 0000000000..cd8fdcea52 --- /dev/null +++ b/src/test/obj_ctl_stats_curr_allocated_wa/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2026, Hewlett Packard Enterprise Development LP + +# +# src/test/obj_ctl_stats_curr_allocated_wa/Makefile -- +# +TARGET = obj_ctl_stats_curr_allocated_wa +OBJS = obj_ctl_stats_curr_allocated_wa.o + +BUILD_STATIC_DEBUG=n +BUILD_STATIC_NONDEBUG=n + +LIBPMEMOBJ=y + +include ../Makefile.inc diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/TEST0 b/src/test/obj_ctl_stats_curr_allocated_wa/TEST0 new file mode 100755 index 0000000000..ae7a341da7 --- /dev/null +++ b/src/test/obj_ctl_stats_curr_allocated_wa/TEST0 @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2026, Hewlett Packard Enterprise Development LP + +. ../unittest/unittest.sh + +require_test_type short +require_fs_type any +require_build_type nondebug + +setup + +OUT_TEMP=out${UNITTEST_NUM}_temp.log + +# i - init +# b - break +# c - check +for step in i b c; do + expect_normal_exit ./obj_ctl_stats_curr_allocated_wa$EXESUFFIX $DIR/testfile1 $step + cat $OUT_LOG_FILE >> $OUT_TEMP +done + +mv $OUT_TEMP $OUT_LOG_FILE + +check + +pass diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c b/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c new file mode 100644 index 0000000000..080080a159 --- /dev/null +++ b/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2017-2023, Intel Corporation */ +/* Copyright 2026, Hewlett Packard Enterprise Development LP */ + +/* + * obj_ctl_stats_curr_allocated_wa.c -- tests for curr_allocated WA + */ + +#include "unittest.h" + +#include <../../libpmemobj/obj.h> + +#define LAYOUT "ctl" + +/* + * init -- creates a pool, enables stats, and allocates an object to make sure + * that stats are initialized. + */ +static void +init(const char *path) +{ + PMEMobjpool *pop; + pop = pmemobj_create(path, LAYOUT, PMEMOBJ_MIN_POOL, S_IWUSR | S_IRUSR); + UT_ASSERTne(pop, NULL); + + int enabled = 1; + uint64_t allocated = 0; + int ret; + + ret = pmemobj_ctl_set(pop, "stats.enabled", &enabled); + UT_ASSERTeq(ret, 0); + + PMEMoid oid; + ret = pmemobj_alloc(pop, &oid, 1, 0, NULL, NULL); + UT_ASSERTeq(ret, 0); + + ret = pmemobj_ctl_get(pop, "stats.heap.curr_allocated", &allocated); + UT_ASSERTeq(ret, 0); + UT_ASSERTne(allocated, 0); + + UT_OUT("curr_allocated: %lu", allocated); + + pmemobj_close(pop); +} + +/* + * break_curr_allocated -- break curr_allocated + */ +static void +break_curr_allocated(const char *path) +{ + PMEMobjpool *pop = pmemobj_open(path, LAYOUT); + UT_ASSERTne(pop, NULL); + + struct pmemobjpool *ppop = (struct pmemobjpool *)pop; + uint64_t *curr_allocated = &ppop->stats_persistent.heap_curr_allocated; + *curr_allocated = UINT64_MAX; + pmem_persist(curr_allocated, sizeof(*curr_allocated)); + + pmemobj_close(pop); +} + +/* + * check -- opens the pool, trigger the curr_allocated WA and checks that it is + * fixed. + */ +static void +check(const char *path) +{ + PMEMobjpool *pop = pmemobj_open(path, LAYOUT); + UT_ASSERTne(pop, NULL); + + uint64_t allocated = 0; + int ret; + + ret = pmemobj_ctl_get(pop, "stats.heap.curr_allocated", &allocated); + UT_ASSERTeq(ret, 0); + UT_ASSERTne(allocated, 0); + UT_ASSERTne(allocated, UINT64_MAX); + + UT_OUT("curr_allocated: %lu", allocated); + + pmemobj_close(pop); +} + +int +main(int argc, char *argv[]) +{ + START(argc, argv, "obj_ctl_stats_curr_allocated_wa"); + + if (argc != 3) { + UT_FATAL("usage: %s file-name step", argv[0]); + } + + const char *path = argv[1]; + const char step = argv[2][0]; + + switch (step) { + case 'i': + init(path); + break; + case 'b': + break_curr_allocated(path); + break; + case 'c': + check(path); + break; + default: + UT_FATAL("invalid step"); + } + + DONE(NULL); +} diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/out0.log.match b/src/test/obj_ctl_stats_curr_allocated_wa/out0.log.match new file mode 100644 index 0000000000..cdc0872153 --- /dev/null +++ b/src/test/obj_ctl_stats_curr_allocated_wa/out0.log.match @@ -0,0 +1,11 @@ +$(S) START: $(S) + ./$(S) $(S) i +curr_allocated: 128 +$(S) DONE +$(S) START: $(S) + ./$(S) $(S) b +$(S) DONE +$(S) START: $(S) + ./$(S) $(S) c +curr_allocated: 128 +$(S) DONE From ad92b377150c01faf95e34584b188b4b0f2b2dd8 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Thu, 23 Apr 2026 22:16:34 +0000 Subject: [PATCH 06/11] obj: fix Copyright Signed-off-by: Jan Michalski --- .../obj_ctl_stats_curr_allocated_wa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c b/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c index 080080a159..5cec23300a 100644 --- a/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c +++ b/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: BSD-3-Clause -/* Copyright 2017-2023, Intel Corporation */ /* Copyright 2026, Hewlett Packard Enterprise Development LP */ /* From 3035869f40def250fa91594de55a70bacd4f90d2 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Fri, 24 Apr 2026 10:33:33 +0000 Subject: [PATCH 07/11] obj: improve underflow detectio and... ... move recalculation to the getter. Signed-off-by: Jan Michalski Co-authored-by: Tomasz Gromadzki --- src/libpmemobj/heap.c | 49 +++++----------------------------------- src/libpmemobj/heap.h | 4 ++-- src/libpmemobj/obj.c | 5 +---- src/libpmemobj/stats.c | 51 +++++++++++++++++++++++++++++++++++++++++- src/libpmemobj/stats.h | 23 +++++++++++++++++-- 5 files changed, 79 insertions(+), 53 deletions(-) diff --git a/src/libpmemobj/heap.c b/src/libpmemobj/heap.c index 138d55be4e..d44bc187aa 100644 --- a/src/libpmemobj/heap.c +++ b/src/libpmemobj/heap.c @@ -857,58 +857,19 @@ heap_zone_get_allocated(struct palloc_heap *heap, uint32_t zone_id) return used; } -#define CURR_ALLOCATED_UNDERFLOW_FMT \ - "heap_curr_allocated underflowed: %" PRIu64 " > heap.size: %" PRIu64 \ - "; recalculating" - /* - * heap_curr_allocated_fix -- recalculate heap_curr_allocated from scratch if it - * is bigger than heap size - * - * heap_curr_allocated although is stored persistently it is not updated - * transactionally nor reliably persisted. This means e.g. that in case - * a transaction succeeds but the process will get terminated before the update - * of heap_curr_allocated, the value of heap_curr_allocated will get out of sync - * with the actual heap state. - * - * The most obvious case of this happening is when heap_curr_allocated is - * actually smaller than the sum of the sizes of all allocations in the heap, - * so in case all of the allocations are freed, heap_curr_allocated will - * underflow and get a very big value, bigger than heap size. - * - * Ref: https://daosio.atlassian.net/browse/DAOS-18882 - * - * This workaround detects this most obvious case and recalculates. - * It is intended to be used during open only. + * heap_curr_allocated_sum -- calculate the curr_allocated statistic */ -void -heap_curr_allocated_wa(struct palloc_heap *heap) +uint64_t +heap_curr_allocated_sum(struct palloc_heap *heap) { - uint64_t *allocatedp = &heap->stats->persistent->heap_curr_allocated; - uint64_t value; - - /* load the value and check if it is not bigger than heap size */ - util_atomic_load_explicit64(allocatedp, &value, memory_order_acquire); - - if (value <= *heap->sizep) { - /* - * It doesn't mean the value is incorrect, just that it is - * reasonable. - */ - return; - } - - CORE_LOG_WARNING(CURR_ALLOCATED_UNDERFLOW_FMT, value, *heap->sizep); + uint64_t value = 0; - /* calculate the correct value */ - value = 0; for (uint32_t z = 0; z < heap->rt->nzones; ++z) { value += heap_zone_get_allocated(heap, z); } - /* store and persist the corrected value */ - util_atomic_store_explicit64(allocatedp, value, memory_order_release); - pmemops_persist(&heap->p_ops, allocatedp, sizeof(*allocatedp)); + return value; } /* diff --git a/src/libpmemobj/heap.h b/src/libpmemobj/heap.h index de8c157990..7793645486 100644 --- a/src/libpmemobj/heap.h +++ b/src/libpmemobj/heap.h @@ -75,8 +75,8 @@ heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m); void heap_ensure_zone_reclaimed(struct palloc_heap *heap, uint32_t zone_id); -void -heap_curr_allocated_wa(struct palloc_heap *heap); +uint64_t +heap_curr_allocated_sum(struct palloc_heap *heap); int heap_free_chunk_reuse(struct palloc_heap *heap, diff --git a/src/libpmemobj/obj.c b/src/libpmemobj/obj.c index 8275d459fc..b5f4f924f4 100644 --- a/src/libpmemobj/obj.c +++ b/src/libpmemobj/obj.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause /* Copyright 2014-2024, Intel Corporation */ -/* Copyright 2025-2026, Hewlett Packard Enterprise Development LP */ +/* Copyright 2025, Hewlett Packard Enterprise Development LP */ /* * obj.c -- transactional object store implementation @@ -21,7 +21,6 @@ #include "ravl.h" #include "heap_layout.h" -#include "heap.h" #include "os.h" #include "os_thread.h" #include "pmemops.h" @@ -971,8 +970,6 @@ obj_runtime_init(PMEMobjpool *pop, int rdonly, int boot, unsigned nlanes) ERR_W_ERRNO("critnib_insert to pools_tree"); goto err_tree_insert; } - - heap_curr_allocated_wa(&pop->heap); } if (obj_ctl_init_and_load(pop) != 0) { diff --git a/src/libpmemobj/stats.c b/src/libpmemobj/stats.c index 2f3e23e302..2359626b5d 100644 --- a/src/libpmemobj/stats.c +++ b/src/libpmemobj/stats.c @@ -5,11 +5,14 @@ * stats.c -- implementation of statistics */ +#include "heap.h" #include "obj.h" #include "stats.h" #include "core_assert.h" -STATS_CTL_HANDLER(persistent, curr_allocated, heap_curr_allocated); +static int +CTL_READ_HANDLER(persistent_curr_allocated)(void *ctx, + enum ctl_query_source source, void *arg, struct ctl_indexes *indexes); STATS_CTL_HANDLER(transient, run_allocated, heap_run_allocated); STATS_CTL_HANDLER(transient, run_active, heap_run_active); @@ -22,6 +25,52 @@ static const struct ctl_node CTL_NODE(heap)[] = { CTL_NODE_END }; +/* + * CTL_READ_HANDLER(persistent_curr_allocated) -- returns curr_allocated field + */ +static int +CTL_READ_HANDLER(persistent_curr_allocated)(void *ctx, + enum ctl_query_source source, void *arg, struct ctl_indexes *indexes) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(source, indexes); + + PMEMobjpool *pop = ctx; + uint64_t *curr_allocated = &pop->stats->persistent->heap_curr_allocated; + + uint64_t *argv = arg; + util_atomic_load_explicit64(curr_allocated, argv, memory_order_acquire); + + /* + * heap_curr_allocated although is stored persistently it is not updated + * transactionally nor reliably persisted. This means e.g. that in case + * a transaction succeeds but the process will get terminated before + * the update of heap_curr_allocated, the value of heap_curr_allocated + * will get out of sync with the actual heap state. + * + * The most obvious case of this happening is when heap_curr_allocated + * is actually smaller than the sum of the sizes of all allocations in + * the heap, so in case all of the allocations are freed, + * heap_curr_allocated will underflow and get a very big value, bigger + * than heap size. + * + * Ref: https://daosio.atlassian.net/browse/DAOS-18882 + * + * This workaround detects this most obvious case and recalculates. + * + * Note: Not thread-safe. + */ + if (*argv > *pop->heap.sizep) { /* covers the == UINT64_MAX case */ + /* if the value is broken, recalculate it */ + *argv = heap_curr_allocated_sum(&pop->heap); + + util_atomic_store_explicit64(curr_allocated, *argv, + memory_order_release); + } + + return 0; +} + /* * CTL_READ_HANDLER(enabled) -- returns whether or not statistics are enabled */ diff --git a/src/libpmemobj/stats.h b/src/libpmemobj/stats.h index 72eeceb896..f7a6bac264 100644 --- a/src/libpmemobj/stats.h +++ b/src/libpmemobj/stats.h @@ -56,10 +56,29 @@ struct stats { util_fetch_and_sub64((&(stats)->transient->name), (value));\ } while (0) +/* + * This macro handles only the curr_allocated statistic. Sadly, it's current + * design makes it possible for it to underflow, which would lead to very large + * values due to the unsigned type. To mitigate this, if the value is already at + * UINT64_MAX or the subtraction would cause an underflow, the value is set to + * UINT64_MAX to indicate that it's broken and needs to be recalculated. + */ #define STATS_SUB_persistent(stats, name, value) do {\ if ((stats)->enabled == POBJ_STATS_ENABLED_PERSISTENT ||\ - (stats)->enabled == POBJ_STATS_ENABLED_BOTH)\ - util_fetch_and_sub64((&(stats)->persistent->name), (value));\ + (stats)->enabled == POBJ_STATS_ENABLED_BOTH) {\ + uint64_t curr_value;\ + util_atomic_load_explicit64((&(stats)->persistent->name),\ + &curr_value, memory_order_acquire);\ + if (curr_value != UINT64_MAX) {\ + curr_value = util_fetch_and_sub64(\ + (&(stats)->persistent->name), (value));\ + if (curr_value < value || curr_value == UINT64_MAX) {\ + util_atomic_store_explicit64(\ + (&(stats)->persistent->name),\ + UINT64_MAX, memory_order_release);\ + }\ + }\ + }\ } while (0) #define STATS_SET(stats, type, name, value) do {\ From 2dbdb9aed96a8401343843aa7bb45e003bde11d1 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Fri, 24 Apr 2026 10:59:41 +0000 Subject: [PATCH 08/11] obj: restore the warning message Signed-off-by: Jan Michalski --- src/libpmemobj/stats.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/libpmemobj/stats.c b/src/libpmemobj/stats.c index 2359626b5d..6174022d9c 100644 --- a/src/libpmemobj/stats.c +++ b/src/libpmemobj/stats.c @@ -5,6 +5,8 @@ * stats.c -- implementation of statistics */ +#include + #include "heap.h" #include "obj.h" #include "stats.h" @@ -25,6 +27,10 @@ static const struct ctl_node CTL_NODE(heap)[] = { CTL_NODE_END }; +#define CURR_ALLOCATED_UNDERFLOW_FMT \ + "heap_curr_allocated underflowed: %" PRIu64 " > heap.size: %" PRIu64 \ + "; recalculating" + /* * CTL_READ_HANDLER(persistent_curr_allocated) -- returns curr_allocated field */ @@ -61,6 +67,9 @@ CTL_READ_HANDLER(persistent_curr_allocated)(void *ctx, * Note: Not thread-safe. */ if (*argv > *pop->heap.sizep) { /* covers the == UINT64_MAX case */ + CORE_LOG_WARNING(CURR_ALLOCATED_UNDERFLOW_FMT, + *argv, *pop->heap.sizep); + /* if the value is broken, recalculate it */ *argv = heap_curr_allocated_sum(&pop->heap); From ee9a1b5d7ce0763554bb9f3879e0f07da8b85b54 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Fri, 24 Apr 2026 11:03:44 +0000 Subject: [PATCH 09/11] obj: remove redundant header Signed-off-by: Jan Michalski --- src/libpmemobj/heap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/libpmemobj/heap.c b/src/libpmemobj/heap.c index d44bc187aa..443dea5f29 100644 --- a/src/libpmemobj/heap.c +++ b/src/libpmemobj/heap.c @@ -10,7 +10,6 @@ #include #include #include -#include #include "bucket.h" #include "libpmemobj/ctl.h" From 9a5528362d3d5839f0e2f1a186c7c0e05b68fbc9 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Fri, 24 Apr 2026 13:23:06 +0000 Subject: [PATCH 10/11] obj: fixes + updated unit test Signed-off-by: Jan Michalski --- src/libpmemobj/heap.c | 1 + src/libpmemobj/stats.h | 11 +- .../obj_ctl_stats_curr_allocated_wa/TEST0 | 14 +- .../obj_ctl_stats_curr_allocated_wa.c | 125 ++++++++++-------- .../out0.log.match | 11 -- .../pmemobj_grep0.log.match | 2 + 6 files changed, 83 insertions(+), 81 deletions(-) delete mode 100644 src/test/obj_ctl_stats_curr_allocated_wa/out0.log.match create mode 100644 src/test/obj_ctl_stats_curr_allocated_wa/pmemobj_grep0.log.match diff --git a/src/libpmemobj/heap.c b/src/libpmemobj/heap.c index 443dea5f29..8c07987f29 100644 --- a/src/libpmemobj/heap.c +++ b/src/libpmemobj/heap.c @@ -819,6 +819,7 @@ heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, * * Note: It is not meant to calculate the sum of all allocations. * It follows the algorithm used to calculate the heap_curr_allocated statistic. + * e.g. It does not take into account free space in runs. */ static uint64_t heap_zone_get_allocated(struct palloc_heap *heap, uint32_t zone_id) diff --git a/src/libpmemobj/stats.h b/src/libpmemobj/stats.h index f7a6bac264..213e727d3b 100644 --- a/src/libpmemobj/stats.h +++ b/src/libpmemobj/stats.h @@ -42,8 +42,15 @@ struct stats { #define STATS_INC_persistent(stats, name, value) do {\ if ((stats)->enabled == POBJ_STATS_ENABLED_PERSISTENT ||\ - (stats)->enabled == POBJ_STATS_ENABLED_BOTH)\ - util_fetch_and_add64((&(stats)->persistent->name), (value));\ + (stats)->enabled == POBJ_STATS_ENABLED_BOTH) {\ + uint64_t curr_value;\ + util_atomic_load_explicit64((&(stats)->persistent->name),\ + &curr_value, memory_order_acquire);\ + if (curr_value != UINT64_MAX) {\ + util_fetch_and_add64(\ + (&(stats)->persistent->name), (value));\ + }\ + }\ } while (0) #define STATS_SUB(stats, type, name, value) do {\ diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/TEST0 b/src/test/obj_ctl_stats_curr_allocated_wa/TEST0 index ae7a341da7..786402641c 100755 --- a/src/test/obj_ctl_stats_curr_allocated_wa/TEST0 +++ b/src/test/obj_ctl_stats_curr_allocated_wa/TEST0 @@ -6,21 +6,15 @@ require_test_type short require_fs_type any -require_build_type nondebug +require_build_type debug setup -OUT_TEMP=out${UNITTEST_NUM}_temp.log +OUT_CHECK=pmemobj_grep${UNITTEST_NUM}.log -# i - init -# b - break -# c - check -for step in i b c; do - expect_normal_exit ./obj_ctl_stats_curr_allocated_wa$EXESUFFIX $DIR/testfile1 $step - cat $OUT_LOG_FILE >> $OUT_TEMP -done +expect_normal_exit ./obj_ctl_stats_curr_allocated_wa$EXESUFFIX $DIR/testfile1 -mv $OUT_TEMP $OUT_LOG_FILE +cat $PMEMOBJ_LOG_FILE | grep "ctl__persistent_curr_allocated_read" > $OUT_CHECK check diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c b/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c index 5cec23300a..83fd1ec151 100644 --- a/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c +++ b/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c @@ -11,75 +11,84 @@ #define LAYOUT "ctl" +#define MB (1 << 20) + /* - * init -- creates a pool, enables stats, and allocates an object to make sure - * that stats are initialized. + * set_curr_allocated -- set curr_allocated to a specific value */ static void -init(const char *path) +set_curr_allocated(PMEMobjpool *pop, uint64_t value) { - PMEMobjpool *pop; - pop = pmemobj_create(path, LAYOUT, PMEMOBJ_MIN_POOL, S_IWUSR | S_IRUSR); - UT_ASSERTne(pop, NULL); + struct pmemobjpool *ppop = (struct pmemobjpool *)pop; + uint64_t *curr_allocated = &ppop->stats_persistent.heap_curr_allocated; + *curr_allocated = value; + pmem_persist(curr_allocated, sizeof(*curr_allocated)); +} - int enabled = 1; - uint64_t allocated = 0; +/* + * get_curr_allocated -- get the curr_allocated value + */ +static uint64_t +get_curr_allocated(PMEMobjpool *pop) +{ int ret; - - ret = pmemobj_ctl_set(pop, "stats.enabled", &enabled); - UT_ASSERTeq(ret, 0); - - PMEMoid oid; - ret = pmemobj_alloc(pop, &oid, 1, 0, NULL, NULL); - UT_ASSERTeq(ret, 0); + uint64_t allocated; ret = pmemobj_ctl_get(pop, "stats.heap.curr_allocated", &allocated); UT_ASSERTeq(ret, 0); - UT_ASSERTne(allocated, 0); - UT_OUT("curr_allocated: %lu", allocated); - - pmemobj_close(pop); + return allocated; } /* - * break_curr_allocated -- break curr_allocated + * huge_alloc -- make a huge allocation */ static void -break_curr_allocated(const char *path) +huge_alloc(PMEMobjpool *pop, PMEMoid *oid) { - PMEMobjpool *pop = pmemobj_open(path, LAYOUT); - UT_ASSERTne(pop, NULL); + int ret = pmemobj_alloc(pop, oid, 4 * MB, 0, NULL, NULL); + UT_ASSERTeq(ret, 0); +} - struct pmemobjpool *ppop = (struct pmemobjpool *)pop; - uint64_t *curr_allocated = &ppop->stats_persistent.heap_curr_allocated; - *curr_allocated = UINT64_MAX; - pmem_persist(curr_allocated, sizeof(*curr_allocated)); +static void +underflow(PMEMobjpool *pop, uint64_t *exp_allocated) +{ + UT_ASSERTne(exp_allocated, NULL); - pmemobj_close(pop); + /* Create a huge allocation and query the statistic */ + PMEMoid oid; + huge_alloc(pop, &oid); + + *exp_allocated = get_curr_allocated(pop); + + /* Inject a smaller than expected value. */ + set_curr_allocated(pop, *exp_allocated - 1); + + /* Trigger underflow. */ + pmemobj_free(&oid); } /* - * check -- opens the pool, trigger the curr_allocated WA and checks that it is - * fixed. + * test -- allocates an object to make sure that stats are initialized. */ static void -check(const char *path) +test(PMEMobjpool *pop) { - PMEMobjpool *pop = pmemobj_open(path, LAYOUT); - UT_ASSERTne(pop, NULL); + PMEMoid oid; + uint64_t exp_allocated; - uint64_t allocated = 0; - int ret; + /* trigger underflow */ + underflow(pop, &exp_allocated); - ret = pmemobj_ctl_get(pop, "stats.heap.curr_allocated", &allocated); - UT_ASSERTeq(ret, 0); - UT_ASSERTne(allocated, 0); - UT_ASSERTne(allocated, UINT64_MAX); + /* Check the statistic is updated correctly. */ + UT_ASSERTeq(get_curr_allocated(pop), 0); - UT_OUT("curr_allocated: %lu", allocated); + /* trigger underflow and attempt crossing zero again */ + underflow(pop, &exp_allocated); + huge_alloc(pop, &oid); - pmemobj_close(pop); + /* Check the statistic is updated correctly. */ + UT_ASSERTeq(get_curr_allocated(pop), exp_allocated); } int @@ -87,26 +96,26 @@ main(int argc, char *argv[]) { START(argc, argv, "obj_ctl_stats_curr_allocated_wa"); - if (argc != 3) { - UT_FATAL("usage: %s file-name step", argv[0]); + if (argc != 2) { + UT_FATAL("usage: %s file-name", argv[0]); } const char *path = argv[1]; - const char step = argv[2][0]; - - switch (step) { - case 'i': - init(path); - break; - case 'b': - break_curr_allocated(path); - break; - case 'c': - check(path); - break; - default: - UT_FATAL("invalid step"); - } + + PMEMobjpool *pop; + pop = pmemobj_create(path, LAYOUT, PMEMOBJ_MIN_POOL, S_IWUSR | S_IRUSR); + UT_ASSERTne(pop, NULL); + + int enabled = 1; + int ret; + + /* Enable stats. */ + ret = pmemobj_ctl_set(pop, "stats.enabled", &enabled); + UT_ASSERTeq(ret, 0); + + test(pop); + + pmemobj_close(pop); DONE(NULL); } diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/out0.log.match b/src/test/obj_ctl_stats_curr_allocated_wa/out0.log.match deleted file mode 100644 index cdc0872153..0000000000 --- a/src/test/obj_ctl_stats_curr_allocated_wa/out0.log.match +++ /dev/null @@ -1,11 +0,0 @@ -$(S) START: $(S) - ./$(S) $(S) i -curr_allocated: 128 -$(S) DONE -$(S) START: $(S) - ./$(S) $(S) b -$(S) DONE -$(S) START: $(S) - ./$(S) $(S) c -curr_allocated: 128 -$(S) DONE diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/pmemobj_grep0.log.match b/src/test/obj_ctl_stats_curr_allocated_wa/pmemobj_grep0.log.match new file mode 100644 index 0000000000..dd72bd43e6 --- /dev/null +++ b/src/test/obj_ctl_stats_curr_allocated_wa/pmemobj_grep0.log.match @@ -0,0 +1,2 @@ +$(S) heap_curr_allocated underflowed: $(N) > heap.size: $(N); recalculating +$(S) heap_curr_allocated underflowed: $(N) > heap.size: $(N); recalculating From 2ccfdbcb8a1230a39fb8e15d255f2c87516386d6 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Fri, 24 Apr 2026 13:25:43 +0000 Subject: [PATCH 11/11] obj: fix comments in the unit test Signed-off-by: Jan Michalski --- .../obj_ctl_stats_curr_allocated_wa.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c b/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c index 83fd1ec151..8a78a43c52 100644 --- a/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c +++ b/src/test/obj_ctl_stats_curr_allocated_wa/obj_ctl_stats_curr_allocated_wa.c @@ -68,22 +68,19 @@ underflow(PMEMobjpool *pop, uint64_t *exp_allocated) pmemobj_free(&oid); } -/* - * test -- allocates an object to make sure that stats are initialized. - */ static void test(PMEMobjpool *pop) { PMEMoid oid; uint64_t exp_allocated; - /* trigger underflow */ + /* 1. trigger underflow */ underflow(pop, &exp_allocated); /* Check the statistic is updated correctly. */ UT_ASSERTeq(get_curr_allocated(pop), 0); - /* trigger underflow and attempt crossing zero again */ + /* 2. trigger underflow and attempt crossing zero again */ underflow(pop, &exp_allocated); huge_alloc(pop, &oid);