From 30322a50a2da6b78145c615c4dfaeb9935e842aa Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Fri, 5 Jun 2026 22:51:27 +0200 Subject: [PATCH 1/5] fix(satellite): decouple rs-discard-granularity from discard-zeroes gating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A partially-written FILE_THIN volume resynced ~2x the bytes upstream LINSTOR did because the rendered .res lacked rs-discard-granularity. autoDiskOptions() early-returned nil for any provider that was not discard-zeroes-safe, coupling rs-discard-granularity (which upstream gates ONLY on the backing device's reported discard granularity) to discard-zeroes-if-aligned (correctly provider-gated). Decouple the two gates to mirror upstream: - discard-zeroes-if-aligned: provider-gated as before, but now rendered explicitly as `no` for non-safe kinds (FILE_THIN, thick LVM, FILE) instead of omitting the whole block — matching upstream 1.33.2's render. - rs-discard-granularity: emitted whenever the backing device reports a non-zero discard granularity (lsblk DISC-GRAN > 0), independent of provider kind, so an aligned all-zero region is UNMAPped during resync instead of transferred. Multi-volume collapse stays conservative: discard-zeroes-if-aligned is `yes` only when every volume is safe (one `no` pins the resource), and rs-discard-granularity is the smallest across volumes. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/satellite/discardgran.go | 145 +++++++++++++++++------------- pkg/satellite/discardgran_test.go | 136 ++++++++++++++++++++++------ 2 files changed, 192 insertions(+), 89 deletions(-) diff --git a/pkg/satellite/discardgran.go b/pkg/satellite/discardgran.go index b30d9cee..acab3621 100644 --- a/pkg/satellite/discardgran.go +++ b/pkg/satellite/discardgran.go @@ -52,8 +52,9 @@ const ( diskOptRsDiscardGranularity = "rs-discard-granularity" diskOptDiscardZeroesAligned = "discard-zeroes-if-aligned" - // drbdYes is DRBD's affirmative value for a boolean disk option. + // drbdYes / drbdNo are DRBD's boolean disk-option values. drbdYes = "yes" + drbdNo = "no" ) // DRBD's accepted bounds for rs-discard-granularity. Mirrors upstream @@ -72,17 +73,20 @@ const ( // resource-scope `disk { }` block (which DRBD applies to every // volume). // -// It probes each local diskful volume's backing device and is -// deliberately CONSERVATIVE for the (today hypothetical) multi-volume -// case: the options are emitted only when EVERY local diskful volume's -// provider is discard-zero-safe; if any volume is thick/file/unknown -// the whole block is omitted (safe full resync for all). The -// rs-discard-granularity, when present, is the SMALLEST across the -// volumes so the single resource-scope value stays a valid multiple of -// every device's discard granularity — never producing an UNDER-aligned -// UNMAP. A volume whose device can't be probed simply contributes no -// granularity (discard-zeroes-if-aligned still applies; DRBD falls back -// to a full copy where it can't discard). +// It probes each local diskful volume's backing device. Each volume +// contributes a discard-zeroes-if-aligned flag (yes/no per provider +// kind) and, when its device reports a non-zero discard granularity, an +// rs-discard-granularity value — INDEPENDENT gates (see autoDiskOptions). +// +// The resource-scope collapse is CONSERVATIVE for the (today +// hypothetical) multi-volume case (mergeResourceDiscardOptions): +// discard-zeroes-if-aligned is "yes" only when EVERY volume is +// discard-zero-safe (one "no" pins the resource to "no"); the +// rs-discard-granularity is the SMALLEST across the volumes so the +// single resource-scope value stays a valid multiple of every device's +// discard granularity — never an UNDER-aligned UNMAP. A volume whose +// device can't be probed simply contributes no granularity (DRBD falls +// back to a full copy where it can't discard). // // A diskless local replica (no diskful volumes) returns nil — there is // no backing device to discard against. @@ -114,11 +118,6 @@ func (r *Reconciler) autoDiskOptionsForResource( device := devices[vol.GetVolumeNumber()] volOpts := autoDiskOptions(ctx, exec, provider.Kind(), device) - if len(volOpts) == 0 { - // This volume is not discard-zero-safe — omit the whole - // block so no volume gets an unsafe option. - return nil - } mergeResourceDiscardOptions(out, volOpts) @@ -133,21 +132,34 @@ func (r *Reconciler) autoDiskOptionsForResource( } // mergeResourceDiscardOptions folds one volume's auto disk options into -// the resource-scope accumulator. discard-zeroes-if-aligned is a flat -// flag (all discard-zero-safe volumes set "yes"). -// rs-discard-granularity collapses to the SMALLEST seen value so the -// single resource-scope number is a valid multiple of every volume's -// backing-device granularity. +// the resource-scope accumulator (a single `disk { }` block DRBD applies +// to every volume of the resource): +// +// - discard-zeroes-if-aligned collapses CONSERVATIVELY to "yes" only +// when EVERY volume is discard-zero-safe; a single "no" volume pins +// the whole resource to "no". Otherwise a thick/file volume sharing +// the resource could be told an aligned discard yields zero when it +// does not — a data-safety violation. (Today resources are +// single-volume, but the merge must stay safe for the multi-volume +// future.) +// - rs-discard-granularity collapses to the SMALLEST seen value so the +// single resource-scope number stays a valid multiple of every +// volume's backing-device granularity (never an UNDER-aligned UNMAP). func mergeResourceDiscardOptions(acc, volOpts map[string]string) { for key, val := range volOpts { - if key != diskOptRsDiscardGranularity { - acc[key] = val - - continue - } - - prev, have := acc[key] - if !have || smallerNumeric(val, prev) { + switch key { + case diskOptRsDiscardGranularity: + prev, have := acc[key] + if !have || smallerNumeric(val, prev) { + acc[key] = val + } + case diskOptDiscardZeroesAligned: + // "no" wins: once any volume is unsafe the resource flag + // must stay "no". Only set "yes" if not yet pinned to "no". + if acc[key] != drbdNo { + acc[key] = val + } + default: acc[key] = val } } @@ -172,54 +184,65 @@ func smallerNumeric(left, right string) bool { // (sans the `DrbdOptions/Disk/` prefix) ready to merge into the // rendered `disk { }` block. // +// The two options are gated INDEPENDENTLY — this mirrors upstream +// LINSTOR, where `rs-discard-granularity` follows the backing device's +// reported discard granularity (CtrlRscDfnApiCallHelper) and +// `discard-zeroes-if-aligned` follows a provider-kind switch +// (CtrlRscCrtApiHelper). Coupling them (omitting the granularity just +// because a provider isn't discard-zero-safe) makes a partially-written +// FILE_THIN volume resync the WHOLE device instead of only the written +// bytes — the Q3 corner-case bug. +// // Data-safety contract — the optimisation may ONLY skip provably-zero -// ranges (thin discard), NEVER any written data. The gates here are -// the same ones upstream LINSTOR uses: +// ranges (thin discard), NEVER any written data: // -// - discard-zeroes-if-aligned is enabled ONLY for provider kinds -// whose backing device deterministically reads/discards as zero in -// an ALIGNED region: LVM_THIN, ZFS, ZFS_THIN. NOT for FILE_THIN -// (loop-backed sparse files do not reliably honour aligned discard -// → zero), NOT for thick LVM / plain FILE / DISKLESS. This matches -// upstream's CtrlRscCrtApiHelper provider switch exactly. Note we -// deliberately do NOT reuse IsThinOrZFS here: that helper includes -// FILE_THIN (correct for the seed-GI skip-sync gate) but FILE_THIN -// must NOT get discard-zeroes-if-aligned. +// - discard-zeroes-if-aligned is `yes` ONLY for provider kinds whose +// backing device deterministically reads/discards as zero in an +// ALIGNED region: LVM_THIN, ZFS, ZFS_THIN. For every other kind +// (FILE_THIN loop-backed sparse file, thick LVM, plain FILE, +// unknown) it is `no` — matching upstream's CtrlRscCrtApiHelper +// switch, which renders an explicit `no`. Note we deliberately do +// NOT reuse IsThinOrZFS here: that helper includes FILE_THIN +// (correct for the seed-GI skip-sync gate) but FILE_THIN must NOT +// get discard-zeroes-if-aligned=yes. // -// - rs-discard-granularity is set ONLY when the backing block device -// actually reports a non-zero discard granularity (lsblk DISC-GRAN -// > 0). A device that reports 0 does not support discard — setting -// a non-zero granularity there would make drbdsetup reject the -// option (or, worse, attempt unsupported UNMAPs). When the -// granularity can't be determined or is 0, we OMIT the key -// entirely and DRBD falls back to a full byte-copy resync — always -// safe. +// - rs-discard-granularity is set whenever the backing block device +// reports a non-zero discard granularity (lsblk DISC-GRAN > 0), +// INDEPENDENT of provider kind. This is the safe & correct gate: +// a non-zero DISC-GRAN means the device supports discard, so DRBD +// can issue an aligned UNMAP for an all-zero region during resync +// instead of writing zeros. (`discard-zeroes-if-aligned` separately +// governs whether DRBD may TREAT an aligned all-zero range as a +// discard on the SyncTarget; with it `no`, DRBD still benefits from +// the granularity on the SyncSource side.) A device reporting 0 +// does not support discard — emitting a non-zero granularity there +// would make drbdsetup reject the option, so we OMIT the key and +// DRBD falls back to a full byte-copy resync. // -// When in doubt the map omits a key (full transfer), never the -// reverse. An empty/nil return means "render no auto disk options". +// When in doubt the map omits the granularity key (full transfer), +// never the reverse. The discard-zeroes flag is ALWAYS present (yes or +// no), mirroring upstream which renders it explicitly. An empty return +// would mean "render no disk block"; this function returns at least the +// discard-zeroes flag for any diskful volume, so the block renders +// whenever a backing device is present. func autoDiskOptions( ctx context.Context, exec storage.Exec, providerKind string, devicePath string, ) map[string]string { - if !discardZeroesIfAligned(providerKind) { - // Thick LVM / plain FILE / FILE_THIN / DISKLESS / unknown: - // the backing device cannot be trusted to discard-to-zero on - // an aligned range. Emit nothing — DRBD keeps its safe - // full-copy resync. - return nil + zeroesVal := drbdNo + if discardZeroesIfAligned(providerKind) { + zeroesVal = drbdYes } out := map[string]string{ - diskOptDiscardZeroesAligned: drbdYes, + diskOptDiscardZeroesAligned: zeroesVal, } // Only attempt the granularity when we have a real device path to // probe. A diskless local replica (devicePath == "") has no - // backing device — discard-zeroes-if-aligned alone is harmless - // (it governs how DRBD treats discards it receives) and we skip - // the granularity probe. + // backing device — skip the granularity probe. if devicePath == "" { return out } diff --git a/pkg/satellite/discardgran_test.go b/pkg/satellite/discardgran_test.go index 5f69605b..eafb68fc 100644 --- a/pkg/satellite/discardgran_test.go +++ b/pkg/satellite/discardgran_test.go @@ -182,35 +182,67 @@ func TestAutoDiskOptions_ProbeFailureOmitsGranularity(t *testing.T) { } } -// TestAutoDiskOptions_ThickOmitsEverything: thick LVM is NOT -// discard-zero safe — NO disk options at all (full safe resync). The -// device is never even probed. -func TestAutoDiskOptions_ThickOmitsEverything(t *testing.T) { +// TestAutoDiskOptions_ThickNoZeroesButGranularity: thick LVM is NOT +// discard-zero safe → discard-zeroes-if-aligned=no, but the granularity +// is gated INDEPENDENTLY on the backing device's reported DISC-GRAN. A +// thick LV whose device supports discard still gets rs-discard-granularity +// so an aligned all-zero region can be UNMAPped during resync. +func TestAutoDiskOptions_ThickNoZeroesButGranularity(t *testing.T) { fx := storage.NewFakeExec() + discGranResponse(fx, "/dev/vg/pvc-1_00000", "65536") opts := autoDiskOptions(context.Background(), fx, ProviderKindLVM, "/dev/vg/pvc-1_00000") - if len(opts) != 0 { - t.Errorf("thick LVM produced disk options %v; want none", opts) + if opts[diskOptDiscardZeroesAligned] != "no" { + t.Errorf("discard-zeroes-if-aligned = %q, want no (thick)", opts[diskOptDiscardZeroesAligned]) } - for _, line := range fx.CommandLines() { - if strings.HasPrefix(line, "lsblk") { - t.Errorf("thick LVM probed the device (%q); want no probe", line) - } + if opts[diskOptRsDiscardGranularity] != "65536" { + t.Errorf("rs-discard-granularity = %q, want 65536", opts[diskOptRsDiscardGranularity]) } } -// TestAutoDiskOptions_FileThinOmitsEverything: FILE_THIN is excluded -// from discard-zeroes-if-aligned (loop-backed sparse file) — no disk -// options, matching upstream's `no` for FILE_THIN. -func TestAutoDiskOptions_FileThinOmitsEverything(t *testing.T) { +// TestAutoDiskOptions_FileThin4KLoop pins the Q3 corner-case: a +// partially-written FILE_THIN volume on a 4 KiB-discard loop device. +// discard-zeroes-if-aligned is `no` (loop-backed sparse file, NOT +// zero-discard safe), but rs-discard-granularity MUST be emitted (4096) +// because the backing loop device reports a non-zero discard +// granularity — DECOUPLED from the discard-zeroes gate. Without this +// blockstor resynced ~2x the bytes upstream did. Target render matches +// upstream 1.33.2: `discard-zeroes-if-aligned no; rs-discard-granularity +// 4096;`. +func TestAutoDiskOptions_FileThin4KLoop(t *testing.T) { fx := storage.NewFakeExec() + discGranResponse(fx, "/var/lib/blockstor/pvc-1.img", "4096") opts := autoDiskOptions(context.Background(), fx, ProviderKindFileThin, "/var/lib/blockstor/pvc-1.img") - if len(opts) != 0 { - t.Errorf("FILE_THIN produced disk options %v; want none", opts) + if opts[diskOptDiscardZeroesAligned] != "no" { + t.Errorf("discard-zeroes-if-aligned = %q, want no (FILE_THIN)", opts[diskOptDiscardZeroesAligned]) + } + + if opts[diskOptRsDiscardGranularity] != "4096" { + t.Errorf("rs-discard-granularity = %q, want 4096 (decoupled from discard-zeroes)", opts[diskOptRsDiscardGranularity]) + } +} + +// TestAutoDiskOptions_FileThinZeroGranNothingUseful: a FILE_THIN volume +// whose backing device reports DISC-GRAN=0 (no discard support) gets +// discard-zeroes-if-aligned=no and NO granularity — DRBD falls back to a +// safe full resync. The disk block still renders the flag (matching +// upstream's explicit `no`). +func TestAutoDiskOptions_FileThinZeroGranNothingUseful(t *testing.T) { + fx := storage.NewFakeExec() + discGranResponse(fx, "/var/lib/blockstor/pvc-1.img", "0") + + opts := autoDiskOptions(context.Background(), fx, ProviderKindFileThin, "/var/lib/blockstor/pvc-1.img") + + if opts[diskOptDiscardZeroesAligned] != "no" { + t.Errorf("discard-zeroes-if-aligned = %q, want no", opts[diskOptDiscardZeroesAligned]) + } + + if _, present := opts[diskOptRsDiscardGranularity]; present { + t.Errorf("rs-discard-granularity present (%q) for DISC-GRAN=0; want omitted", opts[diskOptRsDiscardGranularity]) } } @@ -269,10 +301,13 @@ func TestAutoDiskOptionsForResource_ZfsThin(t *testing.T) { } } -// TestAutoDiskOptionsForResource_ThickNil: a thick LVM pool yields no -// auto disk options. -func TestAutoDiskOptionsForResource_ThickNil(t *testing.T) { +// TestAutoDiskOptionsForResource_ThickNoZeroes: a thick LVM pool yields +// discard-zeroes-if-aligned=no but still picks up rs-discard-granularity +// from the backing device (independent gate). +func TestAutoDiskOptionsForResource_ThickNoZeroes(t *testing.T) { fx := storage.NewFakeExec() + device := "/dev/vg/pvc-1_00000" + discGranResponse(fx, device, "65536") prov := lvm.NewThick(lvm.ThickConfig{VolumeGroup: "vg"}, fx) rec := NewReconciler(ReconcilerConfig{ Providers: map[string]storage.Provider{"thick1": prov}, @@ -280,17 +315,21 @@ func TestAutoDiskOptionsForResource_ThickNil(t *testing.T) { }) opts := rec.autoDiskOptionsForResource(context.Background(), drFor("pvc-1", "thick1"), - map[int32]string{0: "/dev/vg/pvc-1_00000"}) + map[int32]string{0: device}) - if len(opts) != 0 { - t.Fatalf("thick LVM resource produced %v; want none", opts) + if opts[diskOptDiscardZeroesAligned] != "no" || opts[diskOptRsDiscardGranularity] != "65536" { + t.Fatalf("thick LVM resource = %v; want no + 65536", opts) } } -// TestAutoDiskOptionsForResource_FileThinNil: FILE_THIN yields no auto -// disk options (excluded from discard-zeroes-if-aligned). -func TestAutoDiskOptionsForResource_FileThinNil(t *testing.T) { +// TestAutoDiskOptionsForResource_FileThin4KLoop: the Q3 corner-case at +// the resource level — FILE_THIN on a 4 KiB-discard loop yields +// discard-zeroes-if-aligned=no + rs-discard-granularity=4096, so a +// partially-written volume resyncs only the written bytes. +func TestAutoDiskOptionsForResource_FileThin4KLoop(t *testing.T) { fx := storage.NewFakeExec() + device := "/var/lib/blockstor/pvc-1.img" + discGranResponse(fx, device, "4096") prov := file.NewProvider(file.Config{Dir: t.TempDir(), Thin: true}, fx) rec := NewReconciler(ReconcilerConfig{ Providers: map[string]storage.Provider{"file1": prov}, @@ -298,10 +337,10 @@ func TestAutoDiskOptionsForResource_FileThinNil(t *testing.T) { }) opts := rec.autoDiskOptionsForResource(context.Background(), drFor("pvc-1", "file1"), - map[int32]string{0: "/var/lib/blockstor/pvc-1.img"}) + map[int32]string{0: device}) - if len(opts) != 0 { - t.Fatalf("FILE_THIN resource produced %v; want none", opts) + if opts[diskOptDiscardZeroesAligned] != "no" || opts[diskOptRsDiscardGranularity] != "4096" { + t.Fatalf("FILE_THIN resource = %v; want no + 4096", opts) } } @@ -407,6 +446,47 @@ func TestBuildResFile_RendersDiscardDiskBlock(t *testing.T) { } } +// TestBuildResFile_RendersFileThinNoZeroesBlock pins the Q3 rendered +// .res: a FILE_THIN-on-4K-loop volume renders a disk block with +// `discard-zeroes-if-aligned no;` AND `rs-discard-granularity 4096;` — +// matching upstream 1.33.2's render for the same backing. +func TestBuildResFile_RendersFileThinNoZeroesBlock(t *testing.T) { + dr := &intent.DesiredResource{ + Name: "pvc-ft", + NodeName: "n1", + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "file1"}, + }, + Peers: []intent.DesiredPeer{{Name: "n2"}}, + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + "peer.n2.address": "10.0.0.2", "peer.n2.node-id": "1", "peer.n2.port": "7000", + }, + } + devices := map[int32]string{0: "/var/lib/blockstor/pvc-ft.img"} + autoDisk := map[string]string{ + diskOptDiscardZeroesAligned: "no", + diskOptRsDiscardGranularity: "4096", + } + + body, err := buildResFile(dr, "n1", "10.0.0.1", devices, autoDisk) + if err != nil { + t.Fatalf("buildResFile: %v", err) + } + + if !strings.Contains(body, "disk {") { + t.Fatalf("no disk block in:\n%s", body) + } + + if !strings.Contains(body, "discard-zeroes-if-aligned no;") { + t.Errorf("missing discard-zeroes-if-aligned no; in:\n%s", body) + } + + if !strings.Contains(body, "rs-discard-granularity 4096;") { + t.Errorf("missing rs-discard-granularity 4096; in:\n%s", body) + } +} + // TestBuildResFile_NoAutoDiskNoBlock: with no auto disk options and no // operator disk props, no `disk { }` block is rendered (thick/file // safe path). From 0071b4b72087b78cc01402ee56f466018f6c9982 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Fri, 5 Jun 2026 22:54:13 +0200 Subject: [PATCH 2/5] test(corner-q3): pin FILE_THIN rs-discard-granularity at L6 + L7 L6 cli-matrix cell file-thin-rs-discard-granularity.sh: 512M FILE_THIN single replica, write 320M, add 2nd diskful replica, assert the new replica's resync `received` byte counter is close to the written bytes (not the full device) and the rendered .res carries the discard disk block. L7 replay file-thin-rs-discard-granularity.yaml: operator-CLI sequence asserting the rendered DRBD config carries rs-discard-granularity=4096 and discard-zeroes-if-aligned=no for a FILE_THIN resource, plus a clean 3rd-replica sync (drbd_option + sync_clean awaits). Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../file-thin-rs-discard-granularity.sh | 131 ++++++++++++++++++ .../file-thin-rs-discard-granularity.yaml | 79 +++++++++++ 2 files changed, 210 insertions(+) create mode 100755 tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh create mode 100644 tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml diff --git a/tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh b/tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh new file mode 100755 index 00000000..7bd45450 --- /dev/null +++ b/tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +# +# usage: file-thin-rs-discard-granularity.sh WORK_DIR +# +# L6 cli-matrix cell — corner-case Q3. +# +# Reproduction: a 512 MiB FILE_THIN resource, only ~320 MiB of it +# written, gains a 3rd replica. The new replica's resync MUST transfer +# only the written bytes (~320 MiB), NOT the whole 512 MiB device — +# matching upstream LINSTOR 1.33.2 on the same loop backing. +# +# Pre-fix blockstor rendered NO `disk { }` block for FILE_THIN because +# the satellite coupled rs-discard-granularity to the +# discard-zeroes-if-aligned provider gate (which correctly excludes +# FILE_THIN). Without rs-discard-granularity DRBD cannot UNMAP the +# all-zero unwritten ranges during resync, so it copies the whole device +# (~2x the bytes; measured 524012 KiB vs upstream's 327680 KiB). +# +# Contract — assert all three legs: +# 1. the rendered .res on the diskful node carries a `disk { }` block +# with `rs-discard-granularity` and `discard-zeroes-if-aligned no`; +# 2. the new replica converges to UpToDate (clean sync); +# 3. the resync TARGET's `received` byte counter is close to the +# written bytes, NOT to the full device size (the core win). + +set -euo pipefail + +WORK_DIR=${1:?work_dir required} +export KUBECONFIG="$WORK_DIR/kubeconfig" + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=lib.sh +source "$SCRIPT_DIR/lib.sh" + +require_workers 2 + +linstor_cli_setup + +RD=cli-matrix-q3 +SP=stand + +# 512 MiB volume, write 320 MiB, expect resync ≈ written. +VOL_MIB=512 +WRITTEN_MIB=320 +# Upper bound for "received" on the sync target: the written bytes plus +# generous slack for DRBD bitmap/activity-log overhead and rounding. +# A full-device copy would be ~524288 KiB, which this comfortably +# excludes; the pre-fix bug measured 524012 KiB. +MAX_RECEIVED_KIB=$(( (WRITTEN_MIB + 96) * 1024 )) + +cleanup() { + delete_rd "$RD" + assert_no_orphans "$RD" + linstor_cli_teardown +} +trap cleanup EXIT + +N1=$WORKER_1 +N2=$WORKER_2 + +echo ">> [Q3] 512M FILE_THIN single diskful replica on $N1" +"${LCTL[@]}" resource-definition create "$RD" >/dev/null +"${LCTL[@]}" volume-definition create "$RD" "${VOL_MIB}M" >/dev/null +"${LCTL[@]}" resource create "$N1" "$RD" --storage-pool="$SP" >/dev/null + +echo ">> wait $N1 UpToDate" +wait_disk_state "$RD" "$N1" UpToDate 120 + +echo ">> assert rendered .res on $N1 carries the discard disk block" +RES=$(on_node "$N1" cat "/etc/drbd.d/${RD}.res") +if ! grep -q "disk {" <<<"$RES"; then + echo "FAIL: no disk { } block in rendered .res for FILE_THIN:" >&2 + echo "$RES" >&2 + exit 1 +fi +if ! grep -Eq "rs-discard-granularity[[:space:]]+[0-9]+;" <<<"$RES"; then + echo "FAIL: rendered .res lacks rs-discard-granularity:" >&2 + echo "$RES" >&2 + exit 1 +fi +if ! grep -Eq "discard-zeroes-if-aligned[[:space:]]+no;" <<<"$RES"; then + echo "FAIL: rendered .res lacks discard-zeroes-if-aligned no (FILE_THIN):" >&2 + echo "$RES" >&2 + exit 1 +fi +echo " OK: $(grep -E 'rs-discard-granularity|discard-zeroes-if-aligned' <<<"$RES" | tr -s ' \t' ' ')" + +echo ">> write ${WRITTEN_MIB} MiB pattern into the volume on $N1" +# /dev/drbd is the device; resolve the minor from the .res. +MINOR=$(grep -Eo 'minor[[:space:]]+[0-9]+' <<<"$RES" | head -1 | grep -Eo '[0-9]+') +if [[ -z "$MINOR" ]]; then + echo "FAIL: could not resolve DRBD minor from .res" >&2 + exit 1 +fi +on_node "$N1" dd if=/dev/urandom "of=/dev/drbd${MINOR}" bs=1M count="$WRITTEN_MIB" \ + oflag=direct conv=fsync status=none +echo " wrote ${WRITTEN_MIB} MiB to /dev/drbd${MINOR}" + +echo ">> add 3rd... 2nd diskful replica on $N2 (triggers resync)" +"${LCTL[@]}" resource create "$N2" "$RD" --storage-pool="$SP" >/dev/null + +echo ">> wait $N2 UpToDate (clean sync)" +wait_disk_state "$RD" "$N2" UpToDate 300 + +echo ">> read resync-received byte counter on the sync target ($N2)" +# `drbdsetup status --statistics` reports per-peer-device `received:` +# in bytes (total received over the connection — dominated here by the +# initial resync). Sum across peer devices to be robust. +RECEIVED_BYTES=$(on_node "$N2" drbdsetup status "$RD" --statistics --json 2>/dev/null \ + | jq '[.[].connections[].peer_devices[].received // 0] | add' 2>/dev/null || echo "") +if [[ -z "$RECEIVED_BYTES" || "$RECEIVED_BYTES" == "null" ]]; then + # Text fallback: parse `received:` from drbdsetup status. + RECEIVED_BYTES=$(on_node "$N2" drbdsetup status "$RD" --statistics 2>/dev/null \ + | grep -oE 'received:[0-9]+' | head -1 | cut -d: -f2) +fi +if [[ -z "$RECEIVED_BYTES" ]]; then + echo "FAIL: could not read received byte counter on $N2" >&2 + exit 1 +fi +RECEIVED_KIB=$(( RECEIVED_BYTES / 1024 )) + +echo " sync target received: ${RECEIVED_KIB} KiB (written ${WRITTEN_MIB} MiB," +echo " device ${VOL_MIB} MiB; pre-fix bug copied the whole device)" + +if (( RECEIVED_KIB > MAX_RECEIVED_KIB )); then + echo "FAIL: sync target received ${RECEIVED_KIB} KiB > ${MAX_RECEIVED_KIB} KiB" >&2 + echo " => DRBD copied the unwritten zero ranges (rs-discard-granularity not honoured)" >&2 + exit 1 +fi + +echo "PASS: Q3 — FILE_THIN resync transferred only the written bytes" diff --git a/tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml b/tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml new file mode 100644 index 00000000..f9d45f81 --- /dev/null +++ b/tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml @@ -0,0 +1,79 @@ +name: file-thin-rs-discard-granularity +description: | + Corner-case Q3: a FILE_THIN resource on a 4 KiB-discard loop backing + MUST render `rs-discard-granularity` into its DRBD disk options so a + partially-written volume resyncs only the written bytes (upstream + parity), NOT the whole device. + + Pre-fix blockstor coupled rs-discard-granularity to the + discard-zeroes-if-aligned provider gate and so emitted NO disk block at + all for FILE_THIN — a 3rd-replica add resynced ~2x the bytes upstream + LINSTOR did. This workflow asserts, at the operator-CLI level, that: + + 1. the rendered DRBD config on the diskful node carries + rs-discard-granularity (drbd_option, live `drbdsetup show`); + 2. discard-zeroes-if-aligned is `no` for FILE_THIN (provider gate + unchanged); and + 3. adding a 3rd replica converges to a clean UpToDate sync. + + The byte-count win itself is measured by the L6 cli-matrix cell + `file-thin-rs-discard-granularity.sh` (it dd's a partial pattern and + compares received-KiB before vs after the fix). + +prerequisites: + min_nodes: 3 + storage_pool: stand + +vars: + sp: stand + +steps: + - name: create-rd + cmd: ["resource-definition", "create", "{{rd}}"] + expect_exit: 0 + - name: create-vd + cmd: ["volume-definition", "create", "{{rd}}", "512M"] + expect_exit: 0 + - name: r-c-diskful-node1 + cmd: ["resource", "create", "{{node1}}", "{{rd}}", "--storage-pool", "{{sp}}"] + expect_exit: 0 + await: + kind: disk_state + rd: "{{rd}}" + node: "{{node1}}" + expected: UpToDate + timeout_s: 120 + - name: assert-rs-discard-granularity-rendered + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + kind: drbd_option + rd: "{{rd}}" + node: "{{node1}}" + key: "rs-discard-granularity" + expected: "4096" + timeout_s: 60 + - name: assert-discard-zeroes-no + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + kind: drbd_option + rd: "{{rd}}" + node: "{{node1}}" + key: "discard-zeroes-if-aligned" + expected: "no" + timeout_s: 60 + - name: r-c-diskful-node2 + cmd: ["resource", "create", "{{node2}}", "{{rd}}", "--storage-pool", "{{sp}}"] + expect_exit: 0 + await: + kind: sync_clean + rd: "{{rd}}" + node: "{{node2}}" + timeout_s: 300 + +teardown: + - cmd: ["resource-definition", "delete", "{{rd}}"] + +invariants: + - no_orphans From 3a8f865d6f6479e95a1166faa7eba2aa3afbd260 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Fri, 5 Jun 2026 22:58:32 +0200 Subject: [PATCH 3/5] docs(cli-parity): record FILE_THIN disk-block render deltas (Q3 #76) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the rs-discard-granularity decoupling fix, BS renders the same thin-aware-resync disk options as upstream for a discard-capable FILE_THIN volume. Two render-shape divergences remain and are accepted: upstream additionally emits `block-size 512;` (BS omits it), and upstream renders the disk block at volume scope vs BS's resource scope — functionally identical for single-volume resources. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- docs/cli-parity-known-deltas.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cli-parity-known-deltas.md b/docs/cli-parity-known-deltas.md index ef565f2f..f0932c35 100644 --- a/docs/cli-parity-known-deltas.md +++ b/docs/cli-parity-known-deltas.md @@ -41,6 +41,7 @@ Row IDs match the command-catalogue indexes used by `cli-parity-refresh.sh` (see | 70 | `rd lp ` (inherited parent-scope props shown inline) | WIRE_SHAPE | permanent | Corner-case I2. Upstream LINSTOR treats inheritance as a *reference, not a copy*: a property set on a resource-group / controller affects existing children, but the child's `list-properties` does NOT show the inherited key inline (UG9 ~4587: "As the QoS properties are inherited and not copied, you will not see the property listed in any child objects"). BS deliberately INLINES inherited Controller/RG-scope keys into the RD's bare `props` map (stampRDEffectiveProps) so `linstor rd lp` renders them — the python CLI's `rd lp` table reads `props`, and surfacing the effective value there is more operator-friendly than an empty table. The inheritance-is-a-reference half MATCHES (props resolve at read time via effectiveprops.Resolve, so a post-spawn RG change is reflected); only the inline-visibility in `rd lp` diverges, by design. Sensitive controller-scope keys are still redacted (Bug 115). Pinned by `TestCornerI2_InheritanceIsReferenceAndShownInline`. | | 71 | `controller TcpPortAutoRange` / DRBD port + minor base | BEHAVIOR | permanent | Corner-case K3. Upstream LINSTOR's `TcpPortAutoRange` defaults to 7000-7999 and DRBD minors to 1000+ (UG9 ~2225-2231). blockstor deliberately allocates DRBD TCP ports from 20000-20999 and minors from 20000-28999 (`drbd.DefaultPortMin/Max`, `dispatcher.derivePort/deriveMinor`) so a blockstor cluster can run on the same nodes as a live upstream LINSTOR without colliding on the shared kernel `/dev/drbd` minor / TCP-port namespace. The override knob is LINSTOR-compatible (`linstor n set-property DrbdOptions/TcpPortRange "-"` via `ParseRange`); only the DEFAULT base differs. Reuse-after-`rd d` MATCHES upstream: a deleted resource's port/minor re-enters the cluster taken-set and the deterministic lowest-free allocator hands it back. Pinned by `pkg/drbd/portpool_test.go::TestK3_{DefaultPortRangeIsBlockstorWindow,PortReusedAfterDelete}`. | | 74 | `r td --cancel` (stuck-toggle recovery) | BEHAVIOR | permanent | Corner-case H1. VERSION_CONTEXT: upstream LINSTOR ≥1.34.0 recovers a stuck toggle by re-issuing the SAME command (retry) or the OPPOSITE command (cancel). The dev-stand oracle is 1.33.2 (pre-1.34; rejects with a conflict). blockstor satisfies the 1.34 contract: a repeated toggle is idempotent (retry), and the opposite-direction toggle drives the satellite demote path (`applyStorageIfDiskful` → detach + `reclaimVolumesForDiskless`), unwinding the half-carved storage (cancel). blockstor ALSO accepts an explicit `?cancel=true` query (Bug 40) the upstream client never sends — a superset, not a divergence. Pinned by `pkg/rest/resource_toggle_disk_test.go::TestToggleDiskCancelStuckAddDiskScenario4W24` + `pkg/satellite/reconciler_toggle_diskless_test.go::TestBug267ToggleToDisklessReclaimsBackingVolume`. | +| 76 | FILE_THIN `.res` disk block — `block-size` omitted, scope resource vs volume | WIRE_SHAPE | permanent | Corner-case Q3. For a discard-capable FILE_THIN volume BS now renders the same thin-aware-resync disk options upstream does — `discard-zeroes-if-aligned no;` + `rs-discard-granularity ;` (verified byte-identical on a 4 KiB loop: oracle 1.33.2 and BS both emit `4096`). Two render-shape divergences accepted: (a) upstream 1.33.2 additionally emits `block-size 512;` inside the disk block (backing logical-block-size hint, unrelated to discard/resync); BS omits it — DRBD defaults to the backing device's block size, so resync behaviour is identical. (b) Upstream renders the disk block at VOLUME scope (`volume 0 { disk { … } }`); BS renders it at RESOURCE scope (`disk { … }`), which drbdadm applies to every volume — functionally identical for single-volume resources. Pinned by `pkg/satellite/discardgran_test.go::TestAutoDiskOptions_FileThin4KLoop` + `TestBuildResFile_RendersFileThinNoZeroesBlock` (L1), `tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh` (L6), `tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml` (L7). | | 75 | `DrbdOptions/auto-diskful` trigger condition | BEHAVIOR | 2026-12-31 | Corner-case H4. PARTIAL: blockstor honours the property and the RD>RG>Controller priority hierarchy (`internal/controller/auto_diskful_controller.go`), but the firing CONDITION diverges from upstream's documented "Diskless node held DRBD Primary > N minutes". blockstor arms the N-minute timer on a diskful-replica DEFICIT (count < place_count) and additionally promotes a Primary-InUse diskless replica immediately (`ResourceReconciler.maybeAutoDiskful`, no timer, property-independent). The `DrbdOptions/auto-diskful-allow-cleanup` excess-secondary trim is unimplemented (reserved key only). Property plumbing + hierarchy pinned by `internal/controller/auto_diskful_timer_test.go`. Full timed-Primary state machine + cleanup gate tracked as follow-up. | ## Open (block merge until addressed) From ac33480cdeaddc7b9ecc3bf5628f786d431150d7 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sat, 6 Jun 2026 02:22:03 +0200 Subject: [PATCH 4/5] fix(satellite): gate rs-discard-granularity to block-device thin pools (FILE_THIN loop wedge) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rendering rs-discard-granularity into a loop-backed FILE_THIN volume's disk{} block regressed fresh-create convergence. When the elected day0 winner force-primaries to run mkfs (the FileSystem/Type path), mkfs issues a full-device discard; on the loop backing that discard storm, with rs-discard-granularity active, dirties the bitmap relative to the day0-seeded peers and forces a FULL initial SyncTarget that wedges (PausedSyncT dependency between the two targets). Proven on stand + CI: an identical 3-replica create + mkfs converges instantly on LVM_THIN (real block device, same disk block) but full-resyncs and wedges on FILE_THIN (loop) — the e2e respawn-standalone-wedge failure. Gate rs-discard-granularity on the discard-zero-safe block-device provider set (LVM_THIN / ZFS / ZFS_THIN), the same set as discard-zeroes-if-aligned. FILE_THIN now renders only the inert discard-zeroes-if-aligned no (matching pre-feature behaviour) and omits the granularity. The thin-aware-resync win is retained where it is both safe and effective: real block-device thin/ZFS pools. L1 pins (discardgran_test.go): FILE_THIN + thick LVM omit the granularity; LVM_THIN/ZFS keep it; end-to-end render pins TestBuildResFile_FileThinDay0SkipRender (no rs-discard-granularity) and TestBuildResFile_LvmThinDay0SkipRender (keeps it). L6/L7 repurposed to guard the FILE_THIN fresh-create + mkfs day0-skip convergence and assert rs-discard-granularity is absent. Known-delta #76 updated. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- docs/cli-parity-known-deltas.md | 2 +- pkg/satellite/discardgran.go | 71 ++++-- pkg/satellite/discardgran_test.go | 208 +++++++++++++----- .../file-thin-rs-discard-granularity.sh | 107 +++++---- .../file-thin-rs-discard-granularity.yaml | 57 +++-- 5 files changed, 295 insertions(+), 150 deletions(-) diff --git a/docs/cli-parity-known-deltas.md b/docs/cli-parity-known-deltas.md index f0932c35..5c61769a 100644 --- a/docs/cli-parity-known-deltas.md +++ b/docs/cli-parity-known-deltas.md @@ -41,7 +41,7 @@ Row IDs match the command-catalogue indexes used by `cli-parity-refresh.sh` (see | 70 | `rd lp ` (inherited parent-scope props shown inline) | WIRE_SHAPE | permanent | Corner-case I2. Upstream LINSTOR treats inheritance as a *reference, not a copy*: a property set on a resource-group / controller affects existing children, but the child's `list-properties` does NOT show the inherited key inline (UG9 ~4587: "As the QoS properties are inherited and not copied, you will not see the property listed in any child objects"). BS deliberately INLINES inherited Controller/RG-scope keys into the RD's bare `props` map (stampRDEffectiveProps) so `linstor rd lp` renders them — the python CLI's `rd lp` table reads `props`, and surfacing the effective value there is more operator-friendly than an empty table. The inheritance-is-a-reference half MATCHES (props resolve at read time via effectiveprops.Resolve, so a post-spawn RG change is reflected); only the inline-visibility in `rd lp` diverges, by design. Sensitive controller-scope keys are still redacted (Bug 115). Pinned by `TestCornerI2_InheritanceIsReferenceAndShownInline`. | | 71 | `controller TcpPortAutoRange` / DRBD port + minor base | BEHAVIOR | permanent | Corner-case K3. Upstream LINSTOR's `TcpPortAutoRange` defaults to 7000-7999 and DRBD minors to 1000+ (UG9 ~2225-2231). blockstor deliberately allocates DRBD TCP ports from 20000-20999 and minors from 20000-28999 (`drbd.DefaultPortMin/Max`, `dispatcher.derivePort/deriveMinor`) so a blockstor cluster can run on the same nodes as a live upstream LINSTOR without colliding on the shared kernel `/dev/drbd` minor / TCP-port namespace. The override knob is LINSTOR-compatible (`linstor n set-property DrbdOptions/TcpPortRange "-"` via `ParseRange`); only the DEFAULT base differs. Reuse-after-`rd d` MATCHES upstream: a deleted resource's port/minor re-enters the cluster taken-set and the deterministic lowest-free allocator hands it back. Pinned by `pkg/drbd/portpool_test.go::TestK3_{DefaultPortRangeIsBlockstorWindow,PortReusedAfterDelete}`. | | 74 | `r td --cancel` (stuck-toggle recovery) | BEHAVIOR | permanent | Corner-case H1. VERSION_CONTEXT: upstream LINSTOR ≥1.34.0 recovers a stuck toggle by re-issuing the SAME command (retry) or the OPPOSITE command (cancel). The dev-stand oracle is 1.33.2 (pre-1.34; rejects with a conflict). blockstor satisfies the 1.34 contract: a repeated toggle is idempotent (retry), and the opposite-direction toggle drives the satellite demote path (`applyStorageIfDiskful` → detach + `reclaimVolumesForDiskless`), unwinding the half-carved storage (cancel). blockstor ALSO accepts an explicit `?cancel=true` query (Bug 40) the upstream client never sends — a superset, not a divergence. Pinned by `pkg/rest/resource_toggle_disk_test.go::TestToggleDiskCancelStuckAddDiskScenario4W24` + `pkg/satellite/reconciler_toggle_diskless_test.go::TestBug267ToggleToDisklessReclaimsBackingVolume`. | -| 76 | FILE_THIN `.res` disk block — `block-size` omitted, scope resource vs volume | WIRE_SHAPE | permanent | Corner-case Q3. For a discard-capable FILE_THIN volume BS now renders the same thin-aware-resync disk options upstream does — `discard-zeroes-if-aligned no;` + `rs-discard-granularity ;` (verified byte-identical on a 4 KiB loop: oracle 1.33.2 and BS both emit `4096`). Two render-shape divergences accepted: (a) upstream 1.33.2 additionally emits `block-size 512;` inside the disk block (backing logical-block-size hint, unrelated to discard/resync); BS omits it — DRBD defaults to the backing device's block size, so resync behaviour is identical. (b) Upstream renders the disk block at VOLUME scope (`volume 0 { disk { … } }`); BS renders it at RESOURCE scope (`disk { … }`), which drbdadm applies to every volume — functionally identical for single-volume resources. Pinned by `pkg/satellite/discardgran_test.go::TestAutoDiskOptions_FileThin4KLoop` + `TestBuildResFile_RendersFileThinNoZeroesBlock` (L1), `tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh` (L6), `tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml` (L7). | +| 76 | FILE_THIN `.res` disk block — `rs-discard-granularity` omitted (loop-safety), `block-size` omitted, scope resource vs volume | WIRE_SHAPE | permanent | Corner-case Q3. For a FILE_THIN volume BS renders only `discard-zeroes-if-aligned no;` and DELIBERATELY OMITS `rs-discard-granularity`, even though the loop backing reports a non-zero DISC-GRAN and upstream 1.33.2 emits `rs-discard-granularity 4096;`. Reason: rendering the granularity on a loop-backed FILE_THIN volume regresses blockstor's day0 GI-skip on the mkfs/force-primary path — the elected winner's `mkfs` full-device discard, with rs-discard-granularity active, dirties the bitmap relative to the day0-seeded peers and forces a FULL initial SyncTarget that wedges (proven: instant convergence on LVM_THIN with the same disk block, full-resync wedge on FILE_THIN; the e2e respawn-standalone-wedge regression). The thin-aware-resync win (written-bytes-only resync) is retained on the discard-zero-safe BLOCK-device pools (LVM_THIN / ZFS / ZFS_THIN), where the option is both safe and effective. Additional accepted render-shape divergences: (a) upstream additionally emits `block-size 512;` (backing logical-block-size hint, unrelated to discard/resync); BS omits it — DRBD defaults to the device block size. (b) Upstream renders the disk block at VOLUME scope; BS at RESOURCE scope — functionally identical for single-volume resources. Pinned by `pkg/satellite/discardgran_test.go::TestAutoDiskOptions_FileThin4KLoop_NoGranularity` + `TestBuildResFile_FileThinDay0SkipRender` + `TestBuildResFile_LvmThinDay0SkipRender` (L1), `tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh` (L6), `tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml` (L7). | | 75 | `DrbdOptions/auto-diskful` trigger condition | BEHAVIOR | 2026-12-31 | Corner-case H4. PARTIAL: blockstor honours the property and the RD>RG>Controller priority hierarchy (`internal/controller/auto_diskful_controller.go`), but the firing CONDITION diverges from upstream's documented "Diskless node held DRBD Primary > N minutes". blockstor arms the N-minute timer on a diskful-replica DEFICIT (count < place_count) and additionally promotes a Primary-InUse diskless replica immediately (`ResourceReconciler.maybeAutoDiskful`, no timer, property-independent). The `DrbdOptions/auto-diskful-allow-cleanup` excess-secondary trim is unimplemented (reserved key only). Property plumbing + hierarchy pinned by `internal/controller/auto_diskful_timer_test.go`. Full timed-Primary state machine + cleanup gate tracked as follow-up. | ## Open (block merge until addressed) diff --git a/pkg/satellite/discardgran.go b/pkg/satellite/discardgran.go index acab3621..b23b87b0 100644 --- a/pkg/satellite/discardgran.go +++ b/pkg/satellite/discardgran.go @@ -184,14 +184,18 @@ func smallerNumeric(left, right string) bool { // (sans the `DrbdOptions/Disk/` prefix) ready to merge into the // rendered `disk { }` block. // -// The two options are gated INDEPENDENTLY — this mirrors upstream -// LINSTOR, where `rs-discard-granularity` follows the backing device's -// reported discard granularity (CtrlRscDfnApiCallHelper) and -// `discard-zeroes-if-aligned` follows a provider-kind switch -// (CtrlRscCrtApiHelper). Coupling them (omitting the granularity just -// because a provider isn't discard-zero-safe) makes a partially-written -// FILE_THIN volume resync the WHOLE device instead of only the written -// bytes — the Q3 corner-case bug. +// Both options are gated on the SAME discard-zero-safe provider set +// (LVM_THIN / ZFS / ZFS_THIN). discard-zeroes-if-aligned mirrors +// upstream's CtrlRscCrtApiHelper switch; rs-discard-granularity is +// additionally gated on that set rather than on the device's reported +// DISC-GRAN alone — see autoDiskOptions's inline rationale: a loop- +// backed FILE_THIN device reports DISC-GRAN=4096 but rendering +// rs-discard-granularity there REGRESSES fresh-create convergence on +// the mkfs/force-primary path (the wedge that broke the e2e +// respawn-standalone-wedge scenario). Keeping the option on real +// block-device thin/ZFS pools retains the genuine thin-aware-resync win +// (partially-written volume resyncs ~only the written bytes) where it is +// both safe and effective. // // Data-safety contract — the optimisation may ONLY skip provably-zero // ranges (thin discard), NEVER any written data: @@ -206,18 +210,14 @@ func smallerNumeric(left, right string) bool { // (correct for the seed-GI skip-sync gate) but FILE_THIN must NOT // get discard-zeroes-if-aligned=yes. // -// - rs-discard-granularity is set whenever the backing block device -// reports a non-zero discard granularity (lsblk DISC-GRAN > 0), -// INDEPENDENT of provider kind. This is the safe & correct gate: -// a non-zero DISC-GRAN means the device supports discard, so DRBD -// can issue an aligned UNMAP for an all-zero region during resync -// instead of writing zeros. (`discard-zeroes-if-aligned` separately -// governs whether DRBD may TREAT an aligned all-zero range as a -// discard on the SyncTarget; with it `no`, DRBD still benefits from -// the granularity on the SyncSource side.) A device reporting 0 +// - rs-discard-granularity is set ONLY for the discard-zero-safe +// provider set AND when the backing block device reports a non-zero +// discard granularity (lsblk DISC-GRAN > 0). A device reporting 0 // does not support discard — emitting a non-zero granularity there // would make drbdsetup reject the option, so we OMIT the key and -// DRBD falls back to a full byte-copy resync. +// DRBD falls back to a full byte-copy resync. FILE_THIN is excluded +// even though its loop backing reports a non-zero DISC-GRAN (see the +// inline mkfs/loop wedge rationale below). // // When in doubt the map omits the granularity key (full transfer), // never the reverse. The discard-zeroes flag is ALWAYS present (yes or @@ -240,6 +240,41 @@ func autoDiskOptions( diskOptDiscardZeroesAligned: zeroesVal, } + // rs-discard-granularity is emitted ONLY for a discard-zero-safe + // provider kind — i.e. a real block device whose aligned discard + // deterministically reads back zero (LVM_THIN / ZFS / ZFS_THIN). + // + // WHY this is gated on the SAME provider set as discard-zeroes (and + // NOT, as an earlier revision did, on the device's reported + // DISC-GRAN alone): a loop-backed FILE_THIN volume reports a non-zero + // lsblk DISC-GRAN (4096 on the stand) yet rendering + // rs-discard-granularity into its `disk { }` block REGRESSES the + // fresh-create convergence. When the elected day0 winner force- + // primaries to run `mkfs` (the FileSystem/Type path), mkfs issues a + // full-device discard; on the loop backing — which has a documented + // DRBD kernel write-path interaction (see tests/e2e/lib.sh) — that + // discard storm, with rs-discard-granularity active, dirties the + // bitmap relative to the day0-seeded peers and forces a FULL initial + // SyncTarget that then wedges (PausedSyncT dependency between the two + // targets). Proven on the dev stand + CI: an identical 3-replica + // create with FileSystem/Type=ext4 CONVERGES INSTANTLY on LVM_THIN + // (real block device, same full disk block) but FULL-resyncs and + // wedges on FILE_THIN (loop). The e2e `respawn-standalone-wedge` + // scenario (FILE_THIN + mkfs) failed for exactly this reason; the + // `replica-add-no-resync` scenario (FILE_THIN, NO mkfs) passed, so + // the trigger is the mkfs-discard × loop × rs-discard-granularity + // interaction, not the option in isolation. + // + // For FILE_THIN we therefore render only `discard-zeroes-if-aligned + // no` (DRBD's inert default) and OMIT the granularity — DRBD then + // does a full byte-copy resync on the rare loop relocate, which is + // always correct, while the day0 skip + mkfs convergence is + // preserved. The genuine thin-aware-resync win is retained where it + // is both safe AND effective: the real block-device thin/ZFS pools. + if !discardZeroesIfAligned(providerKind) { + return out + } + // Only attempt the granularity when we have a real device path to // probe. A diskless local replica (devicePath == "") has no // backing device — skip the granularity probe. diff --git a/pkg/satellite/discardgran_test.go b/pkg/satellite/discardgran_test.go index eafb68fc..86c40dad 100644 --- a/pkg/satellite/discardgran_test.go +++ b/pkg/satellite/discardgran_test.go @@ -182,12 +182,16 @@ func TestAutoDiskOptions_ProbeFailureOmitsGranularity(t *testing.T) { } } -// TestAutoDiskOptions_ThickNoZeroesButGranularity: thick LVM is NOT -// discard-zero safe → discard-zeroes-if-aligned=no, but the granularity -// is gated INDEPENDENTLY on the backing device's reported DISC-GRAN. A -// thick LV whose device supports discard still gets rs-discard-granularity -// so an aligned all-zero region can be UNMAPped during resync. -func TestAutoDiskOptions_ThickNoZeroesButGranularity(t *testing.T) { +// TestAutoDiskOptions_ThickNoZeroesNoGranularity: thick LVM is NOT +// discard-zero safe → discard-zeroes-if-aligned=no, AND it gets NO +// rs-discard-granularity. rs-discard-granularity is gated on the +// discard-zero-safe provider set (LVM_THIN / ZFS / ZFS_THIN); a thick +// LV is excluded even when its device reports a non-zero DISC-GRAN, +// because an aligned discard on thick storage does NOT deterministically +// read back zero — UNMAPping a region there during resync could expose +// stale bytes as zeros (and it is also part of the loop/FILE_THIN +// fresh-create regression class this gate closes). +func TestAutoDiskOptions_ThickNoZeroesNoGranularity(t *testing.T) { fx := storage.NewFakeExec() discGranResponse(fx, "/dev/vg/pvc-1_00000", "65536") @@ -197,21 +201,36 @@ func TestAutoDiskOptions_ThickNoZeroesButGranularity(t *testing.T) { t.Errorf("discard-zeroes-if-aligned = %q, want no (thick)", opts[diskOptDiscardZeroesAligned]) } - if opts[diskOptRsDiscardGranularity] != "65536" { - t.Errorf("rs-discard-granularity = %q, want 65536", opts[diskOptRsDiscardGranularity]) + if _, present := opts[diskOptRsDiscardGranularity]; present { + t.Errorf("rs-discard-granularity present (%q) for thick LVM; want omitted", opts[diskOptRsDiscardGranularity]) } } -// TestAutoDiskOptions_FileThin4KLoop pins the Q3 corner-case: a -// partially-written FILE_THIN volume on a 4 KiB-discard loop device. -// discard-zeroes-if-aligned is `no` (loop-backed sparse file, NOT -// zero-discard safe), but rs-discard-granularity MUST be emitted (4096) -// because the backing loop device reports a non-zero discard -// granularity — DECOUPLED from the discard-zeroes gate. Without this -// blockstor resynced ~2x the bytes upstream did. Target render matches -// upstream 1.33.2: `discard-zeroes-if-aligned no; rs-discard-granularity -// 4096;`. -func TestAutoDiskOptions_FileThin4KLoop(t *testing.T) { +// TestAutoDiskOptions_FileThin4KLoop_NoGranularity is the Q3-REGRESSION +// pin: a FILE_THIN volume on a 4 KiB-discard loop device gets +// discard-zeroes-if-aligned=no (loop-backed sparse file, NOT zero-discard +// safe) AND — critically — NO rs-discard-granularity, EVEN THOUGH the +// loop device reports DISC-GRAN=4096. +// +// Why the granularity MUST be omitted here (the regression this fixes): +// rendering rs-discard-granularity into a loop-backed FILE_THIN volume's +// disk block breaks fresh-create convergence. When the elected day0 +// winner force-primaries to run mkfs (the FileSystem/Type path), mkfs +// issues a full-device discard; on the loop backing — which has a +// documented DRBD kernel write-path interaction — that discard storm, +// with rs-discard-granularity active, dirties the bitmap relative to the +// day0-seeded peers and forces a FULL initial SyncTarget that then wedges. +// Proven on stand+CI: an identical 3-replica create + mkfs CONVERGES +// INSTANTLY on LVM_THIN (real block device, same disk block) but +// FULL-resyncs and wedges on FILE_THIN (loop). The e2e +// respawn-standalone-wedge scenario failed for exactly this reason. +// +// So rs-discard-granularity is gated on the discard-zero-safe provider +// set (LVM_THIN / ZFS / ZFS_THIN); FILE_THIN is excluded and renders only +// the inert `discard-zeroes-if-aligned no` — exactly what main rendered +// before the thin-aware-resync feature (a no-op disk option), preserving +// the day0 skip + mkfs convergence. +func TestAutoDiskOptions_FileThin4KLoop_NoGranularity(t *testing.T) { fx := storage.NewFakeExec() discGranResponse(fx, "/var/lib/blockstor/pvc-1.img", "4096") @@ -221,16 +240,27 @@ func TestAutoDiskOptions_FileThin4KLoop(t *testing.T) { t.Errorf("discard-zeroes-if-aligned = %q, want no (FILE_THIN)", opts[diskOptDiscardZeroesAligned]) } - if opts[diskOptRsDiscardGranularity] != "4096" { - t.Errorf("rs-discard-granularity = %q, want 4096 (decoupled from discard-zeroes)", opts[diskOptRsDiscardGranularity]) + if _, present := opts[diskOptRsDiscardGranularity]; present { + t.Errorf("rs-discard-granularity present (%q) for loop-backed FILE_THIN; want OMITTED "+ + "(breaks fresh-create+mkfs convergence — see respawn-standalone-wedge regression)", + opts[diskOptRsDiscardGranularity]) + } + + // FILE_THIN must NOT probe lsblk at all for the granularity — the + // provider-kind gate short-circuits before the probe. (The probe is + // still harmless, but pinning no-probe documents the gate order.) + for _, cl := range fx.CommandLines() { + if strings.Contains(cl, "lsblk") { + t.Errorf("FILE_THIN probed lsblk (%q); the provider gate should short-circuit before the probe", cl) + } } } // TestAutoDiskOptions_FileThinZeroGranNothingUseful: a FILE_THIN volume -// whose backing device reports DISC-GRAN=0 (no discard support) gets -// discard-zeroes-if-aligned=no and NO granularity — DRBD falls back to a -// safe full resync. The disk block still renders the flag (matching -// upstream's explicit `no`). +// whose backing device reports DISC-GRAN=0 still gets only +// discard-zeroes-if-aligned=no and NO granularity. (FILE_THIN is gated +// out of the granularity regardless of DISC-GRAN; this case just confirms +// the rendered flag is the inert `no`.) func TestAutoDiskOptions_FileThinZeroGranNothingUseful(t *testing.T) { fx := storage.NewFakeExec() discGranResponse(fx, "/var/lib/blockstor/pvc-1.img", "0") @@ -242,7 +272,7 @@ func TestAutoDiskOptions_FileThinZeroGranNothingUseful(t *testing.T) { } if _, present := opts[diskOptRsDiscardGranularity]; present { - t.Errorf("rs-discard-granularity present (%q) for DISC-GRAN=0; want omitted", opts[diskOptRsDiscardGranularity]) + t.Errorf("rs-discard-granularity present (%q) for FILE_THIN; want omitted", opts[diskOptRsDiscardGranularity]) } } @@ -302,8 +332,9 @@ func TestAutoDiskOptionsForResource_ZfsThin(t *testing.T) { } // TestAutoDiskOptionsForResource_ThickNoZeroes: a thick LVM pool yields -// discard-zeroes-if-aligned=no but still picks up rs-discard-granularity -// from the backing device (independent gate). +// discard-zeroes-if-aligned=no and NO rs-discard-granularity — the +// granularity is gated on the discard-zero-safe provider set, which +// excludes thick LVM. func TestAutoDiskOptionsForResource_ThickNoZeroes(t *testing.T) { fx := storage.NewFakeExec() device := "/dev/vg/pvc-1_00000" @@ -317,15 +348,22 @@ func TestAutoDiskOptionsForResource_ThickNoZeroes(t *testing.T) { opts := rec.autoDiskOptionsForResource(context.Background(), drFor("pvc-1", "thick1"), map[int32]string{0: device}) - if opts[diskOptDiscardZeroesAligned] != "no" || opts[diskOptRsDiscardGranularity] != "65536" { - t.Fatalf("thick LVM resource = %v; want no + 65536", opts) + if opts[diskOptDiscardZeroesAligned] != "no" { + t.Fatalf("thick LVM resource = %v; want discard-zeroes-if-aligned=no", opts) + } + + if _, present := opts[diskOptRsDiscardGranularity]; present { + t.Fatalf("thick LVM resource carries rs-discard-granularity (%v); want omitted", opts) } } -// TestAutoDiskOptionsForResource_FileThin4KLoop: the Q3 corner-case at -// the resource level — FILE_THIN on a 4 KiB-discard loop yields -// discard-zeroes-if-aligned=no + rs-discard-granularity=4096, so a -// partially-written volume resyncs only the written bytes. +// TestAutoDiskOptionsForResource_FileThin4KLoop is the Q3-REGRESSION pin +// at the resource level: FILE_THIN on a 4 KiB-discard loop yields ONLY +// discard-zeroes-if-aligned=no and NO rs-discard-granularity. Rendering +// the granularity on a loop-backed FILE_THIN volume regressed fresh- +// create convergence on the mkfs/force-primary path (the e2e +// respawn-standalone-wedge full-resync wedge); see +// TestAutoDiskOptions_FileThin4KLoop_NoGranularity for the full rationale. func TestAutoDiskOptionsForResource_FileThin4KLoop(t *testing.T) { fx := storage.NewFakeExec() device := "/var/lib/blockstor/pvc-1.img" @@ -339,8 +377,13 @@ func TestAutoDiskOptionsForResource_FileThin4KLoop(t *testing.T) { opts := rec.autoDiskOptionsForResource(context.Background(), drFor("pvc-1", "file1"), map[int32]string{0: device}) - if opts[diskOptDiscardZeroesAligned] != "no" || opts[diskOptRsDiscardGranularity] != "4096" { - t.Fatalf("FILE_THIN resource = %v; want no + 4096", opts) + if opts[diskOptDiscardZeroesAligned] != "no" { + t.Fatalf("FILE_THIN resource = %v; want discard-zeroes-if-aligned=no", opts) + } + + if _, present := opts[diskOptRsDiscardGranularity]; present { + t.Fatalf("FILE_THIN resource carries rs-discard-granularity (%v); want OMITTED "+ + "(loop+mkfs fresh-create wedge regression)", opts) } } @@ -446,11 +489,34 @@ func TestBuildResFile_RendersDiscardDiskBlock(t *testing.T) { } } -// TestBuildResFile_RendersFileThinNoZeroesBlock pins the Q3 rendered -// .res: a FILE_THIN-on-4K-loop volume renders a disk block with -// `discard-zeroes-if-aligned no;` AND `rs-discard-granularity 4096;` — -// matching upstream 1.33.2's render for the same backing. -func TestBuildResFile_RendersFileThinNoZeroesBlock(t *testing.T) { +// TestBuildResFile_FileThinDay0SkipRender is the END-TO-END Q3-REGRESSION +// pin: a FILE_THIN-on-4K-loop volume, driven through the REAL +// autoDiskOptionsForResource gate (not a hand-built map), renders a +// `.res` whose disk block carries `discard-zeroes-if-aligned no;` but +// NEVER `rs-discard-granularity`. This is the render that preserves +// blockstor's day0 GI-skip + mkfs/force-primary convergence on loop- +// backed FILE_THIN. +// +// REGRESSION CONTEXT: a prior revision rendered `rs-discard-granularity +// 4096;` into this exact block (the loop reports DISC-GRAN=4096). On a +// fresh 3-replica create whose elected winner force-primaries to run +// mkfs, that option turned the day0-skip into a FULL initial SyncTarget +// that wedged — the e2e respawn-standalone-wedge failure. The same disk +// block is INERT on a real block device (LVM_THIN converges instantly), +// so the granularity is gated to the discard-zero-safe block-device +// providers and OMITTED for loop-backed FILE_THIN. This test renders the +// .res through the production path so a re-decoupling that re-introduces +// the granularity fails here at L1. +func TestBuildResFile_FileThinDay0SkipRender(t *testing.T) { + fx := storage.NewFakeExec() + device := "/var/lib/blockstor/pvc-ft.img" + discGranResponse(fx, device, "4096") + prov := file.NewProvider(file.Config{Dir: t.TempDir(), Thin: true}, fx) + rec := NewReconciler(ReconcilerConfig{ + Providers: map[string]storage.Provider{"file1": prov}, + Exec: fx, + }) + dr := &intent.DesiredResource{ Name: "pvc-ft", NodeName: "n1", @@ -463,27 +529,67 @@ func TestBuildResFile_RendersFileThinNoZeroesBlock(t *testing.T) { "peer.n2.address": "10.0.0.2", "peer.n2.node-id": "1", "peer.n2.port": "7000", }, } - devices := map[int32]string{0: "/var/lib/blockstor/pvc-ft.img"} - autoDisk := map[string]string{ - diskOptDiscardZeroesAligned: "no", - diskOptRsDiscardGranularity: "4096", - } + devices := map[int32]string{0: device} + + autoDisk := rec.autoDiskOptionsForResource(context.Background(), dr, devices) body, err := buildResFile(dr, "n1", "10.0.0.1", devices, autoDisk) if err != nil { t.Fatalf("buildResFile: %v", err) } - if !strings.Contains(body, "disk {") { - t.Fatalf("no disk block in:\n%s", body) + if !strings.Contains(body, "discard-zeroes-if-aligned no;") { + t.Errorf("missing discard-zeroes-if-aligned no; in FILE_THIN render:\n%s", body) } - if !strings.Contains(body, "discard-zeroes-if-aligned no;") { - t.Errorf("missing discard-zeroes-if-aligned no; in:\n%s", body) + if strings.Contains(body, "rs-discard-granularity") { + t.Errorf("FILE_THIN render carries rs-discard-granularity — day0-skip/mkfs convergence "+ + "REGRESSION (respawn-standalone-wedge):\n%s", body) + } +} + +// TestBuildResFile_LvmThinDay0SkipRender is the positive counterpart: +// a REAL block-device thin provider (LVM_THIN) DOES get +// `rs-discard-granularity` rendered (the genuine thin-aware-resync win is +// retained where it is safe + effective). Together with the FILE_THIN +// pin above this nails the provider-gate boundary at L1. +func TestBuildResFile_LvmThinDay0SkipRender(t *testing.T) { + fx := storage.NewFakeExec() + device := "/dev/vg/pvc-lt_00000" + discGranResponse(fx, device, "65536") + prov := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "pool"}, fx) + rec := NewReconciler(ReconcilerConfig{ + Providers: map[string]storage.Provider{"thin1": prov}, + Exec: fx, + }) + + dr := &intent.DesiredResource{ + Name: "pvc-lt", + NodeName: "n1", + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + Peers: []intent.DesiredPeer{{Name: "n2"}}, + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + "peer.n2.address": "10.0.0.2", "peer.n2.node-id": "1", "peer.n2.port": "7000", + }, + } + devices := map[int32]string{0: device} + + autoDisk := rec.autoDiskOptionsForResource(context.Background(), dr, devices) + + body, err := buildResFile(dr, "n1", "10.0.0.1", devices, autoDisk) + if err != nil { + t.Fatalf("buildResFile: %v", err) + } + + if !strings.Contains(body, "discard-zeroes-if-aligned yes;") { + t.Errorf("missing discard-zeroes-if-aligned yes; in LVM_THIN render:\n%s", body) } - if !strings.Contains(body, "rs-discard-granularity 4096;") { - t.Errorf("missing rs-discard-granularity 4096; in:\n%s", body) + if !strings.Contains(body, "rs-discard-granularity 65536;") { + t.Errorf("LVM_THIN render missing rs-discard-granularity (thin-aware-resync win lost):\n%s", body) } } diff --git a/tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh b/tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh index 7bd45450..22dd9185 100755 --- a/tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh +++ b/tests/e2e/cli-matrix/file-thin-rs-discard-granularity.sh @@ -2,26 +2,36 @@ # # usage: file-thin-rs-discard-granularity.sh WORK_DIR # -# L6 cli-matrix cell — corner-case Q3. +# L6 cli-matrix cell — corner-case Q3 (FILE_THIN disk-block render) + +# its fresh-create convergence REGRESSION guard. # -# Reproduction: a 512 MiB FILE_THIN resource, only ~320 MiB of it -# written, gains a 3rd replica. The new replica's resync MUST transfer -# only the written bytes (~320 MiB), NOT the whole 512 MiB device — -# matching upstream LINSTOR 1.33.2 on the same loop backing. +# Background: the thin-aware-resync feature renders DRBD's +# `discard-zeroes-if-aligned` + `rs-discard-granularity` into the disk +# block so a partially-written THIN volume resyncs ~only the written +# bytes. That win is gated to the discard-zero-safe BLOCK-device provider +# kinds (LVM_THIN / ZFS / ZFS_THIN). # -# Pre-fix blockstor rendered NO `disk { }` block for FILE_THIN because -# the satellite coupled rs-discard-granularity to the -# discard-zeroes-if-aligned provider gate (which correctly excludes -# FILE_THIN). Without rs-discard-granularity DRBD cannot UNMAP the -# all-zero unwritten ranges during resync, so it copies the whole device -# (~2x the bytes; measured 524012 KiB vs upstream's 327680 KiB). +# A FILE_THIN volume is loop-backed: it reports a non-zero lsblk +# DISC-GRAN (4096) but rendering `rs-discard-granularity` into its disk +# block REGRESSED fresh-create convergence. When the elected day0 winner +# force-primaries to run mkfs (the FileSystem/Type path), mkfs issues a +# full-device discard; on the loop backing that discard storm, with +# rs-discard-granularity active, dirtied the bitmap relative to the +# day0-seeded peers and forced a FULL initial SyncTarget that then wedged +# (the e2e respawn-standalone-wedge failure). The identical create +# converges instantly on LVM_THIN (real block device, same disk block) — +# so the option is gated OUT of loop-backed FILE_THIN. # -# Contract — assert all three legs: -# 1. the rendered .res on the diskful node carries a `disk { }` block -# with `rs-discard-granularity` and `discard-zeroes-if-aligned no`; -# 2. the new replica converges to UpToDate (clean sync); -# 3. the resync TARGET's `received` byte counter is close to the -# written bytes, NOT to the full device size (the core win). +# Contract — assert all three legs on FILE_THIN (`stand` pool): +# 1. the rendered .res carries `discard-zeroes-if-aligned no;` +# (upstream-parity provider flag) but NO `rs-discard-granularity` +# (the regression-causing key); +# 2. a fresh resource whose RD carries FileSystem/Type=ext4 (drives the +# winner force-primary + mkfs path) converges WITHOUT a full initial +# resync — i.e. the day0 skip holds (peers reach UpToDate, the +# sync-target `received` byte counter stays ~0, not the device size); +# 3. a 2nd diskful replica added to the *fresh* RD also converges via +# the day0 skip (no full resync of the empty device). set -euo pipefail @@ -39,14 +49,12 @@ linstor_cli_setup RD=cli-matrix-q3 SP=stand -# 512 MiB volume, write 320 MiB, expect resync ≈ written. VOL_MIB=512 -WRITTEN_MIB=320 -# Upper bound for "received" on the sync target: the written bytes plus -# generous slack for DRBD bitmap/activity-log overhead and rounding. -# A full-device copy would be ~524288 KiB, which this comfortably -# excludes; the pre-fix bug measured 524012 KiB. -MAX_RECEIVED_KIB=$(( (WRITTEN_MIB + 96) * 1024 )) +# The day0 skip means the freshly-added empty replica should transfer +# essentially nothing. Allow generous slack for DRBD bitmap/AL metadata +# overhead, but FAR below the full device (~524288 KiB) — a full resync +# (the regression) would blow past this. +MAX_RECEIVED_KIB=$(( 64 * 1024 )) cleanup() { delete_rd "$RD" @@ -58,58 +66,40 @@ trap cleanup EXIT N1=$WORKER_1 N2=$WORKER_2 -echo ">> [Q3] 512M FILE_THIN single diskful replica on $N1" +echo ">> [Q3] 512M FILE_THIN, FileSystem/Type=ext4 (drives force-primary mkfs), diskful on $N1" "${LCTL[@]}" resource-definition create "$RD" >/dev/null +"${LCTL[@]}" resource-definition set-property "$RD" FileSystem/Type ext4 >/dev/null "${LCTL[@]}" volume-definition create "$RD" "${VOL_MIB}M" >/dev/null "${LCTL[@]}" resource create "$N1" "$RD" --storage-pool="$SP" >/dev/null echo ">> wait $N1 UpToDate" wait_disk_state "$RD" "$N1" UpToDate 120 -echo ">> assert rendered .res on $N1 carries the discard disk block" +echo ">> assert rendered .res on $N1: discard-zeroes no, NO rs-discard-granularity" RES=$(on_node "$N1" cat "/etc/drbd.d/${RD}.res") -if ! grep -q "disk {" <<<"$RES"; then - echo "FAIL: no disk { } block in rendered .res for FILE_THIN:" >&2 - echo "$RES" >&2 - exit 1 -fi -if ! grep -Eq "rs-discard-granularity[[:space:]]+[0-9]+;" <<<"$RES"; then - echo "FAIL: rendered .res lacks rs-discard-granularity:" >&2 - echo "$RES" >&2 - exit 1 -fi if ! grep -Eq "discard-zeroes-if-aligned[[:space:]]+no;" <<<"$RES"; then echo "FAIL: rendered .res lacks discard-zeroes-if-aligned no (FILE_THIN):" >&2 echo "$RES" >&2 exit 1 fi -echo " OK: $(grep -E 'rs-discard-granularity|discard-zeroes-if-aligned' <<<"$RES" | tr -s ' \t' ' ')" - -echo ">> write ${WRITTEN_MIB} MiB pattern into the volume on $N1" -# /dev/drbd is the device; resolve the minor from the .res. -MINOR=$(grep -Eo 'minor[[:space:]]+[0-9]+' <<<"$RES" | head -1 | grep -Eo '[0-9]+') -if [[ -z "$MINOR" ]]; then - echo "FAIL: could not resolve DRBD minor from .res" >&2 +if grep -Eq "rs-discard-granularity" <<<"$RES"; then + echo "FAIL: rendered .res carries rs-discard-granularity for loop-backed FILE_THIN" >&2 + echo " => fresh-create + mkfs day0-skip regression (respawn-standalone-wedge)" >&2 + echo "$RES" >&2 exit 1 fi -on_node "$N1" dd if=/dev/urandom "of=/dev/drbd${MINOR}" bs=1M count="$WRITTEN_MIB" \ - oflag=direct conv=fsync status=none -echo " wrote ${WRITTEN_MIB} MiB to /dev/drbd${MINOR}" +echo " OK: $(grep -E 'discard-zeroes-if-aligned' <<<"$RES" | tr -s ' \t' ' '); no rs-discard-granularity" -echo ">> add 3rd... 2nd diskful replica on $N2 (triggers resync)" +echo ">> add 2nd diskful replica on $N2 (must day0-skip, NOT full-resync the empty device)" "${LCTL[@]}" resource create "$N2" "$RD" --storage-pool="$SP" >/dev/null -echo ">> wait $N2 UpToDate (clean sync)" +echo ">> wait $N2 UpToDate (clean, no full resync)" wait_disk_state "$RD" "$N2" UpToDate 300 -echo ">> read resync-received byte counter on the sync target ($N2)" -# `drbdsetup status --statistics` reports per-peer-device `received:` -# in bytes (total received over the connection — dominated here by the -# initial resync). Sum across peer devices to be robust. +echo ">> read resync-received byte counter on $N2 (day0 skip => ~0)" RECEIVED_BYTES=$(on_node "$N2" drbdsetup status "$RD" --statistics --json 2>/dev/null \ | jq '[.[].connections[].peer_devices[].received // 0] | add' 2>/dev/null || echo "") if [[ -z "$RECEIVED_BYTES" || "$RECEIVED_BYTES" == "null" ]]; then - # Text fallback: parse `received:` from drbdsetup status. RECEIVED_BYTES=$(on_node "$N2" drbdsetup status "$RD" --statistics 2>/dev/null \ | grep -oE 'received:[0-9]+' | head -1 | cut -d: -f2) fi @@ -119,13 +109,14 @@ if [[ -z "$RECEIVED_BYTES" ]]; then fi RECEIVED_KIB=$(( RECEIVED_BYTES / 1024 )) -echo " sync target received: ${RECEIVED_KIB} KiB (written ${WRITTEN_MIB} MiB," -echo " device ${VOL_MIB} MiB; pre-fix bug copied the whole device)" +echo " sync target received: ${RECEIVED_KIB} KiB (device ${VOL_MIB} MiB;" +echo " a full resync — the regression — would be ~$(( VOL_MIB * 1024 )) KiB)" if (( RECEIVED_KIB > MAX_RECEIVED_KIB )); then - echo "FAIL: sync target received ${RECEIVED_KIB} KiB > ${MAX_RECEIVED_KIB} KiB" >&2 - echo " => DRBD copied the unwritten zero ranges (rs-discard-granularity not honoured)" >&2 + echo "FAIL: $N2 received ${RECEIVED_KIB} KiB > ${MAX_RECEIVED_KIB} KiB" >&2 + echo " => the empty replica FULL-resynced (day0 skip broke)" >&2 exit 1 fi -echo "PASS: Q3 — FILE_THIN resync transferred only the written bytes" +echo "PASS: Q3 — FILE_THIN renders no rs-discard-granularity and the fresh-create" +echo " + mkfs day0 skip holds (no full resync of the empty device)" diff --git a/tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml b/tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml index f9d45f81..c24438e6 100644 --- a/tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml +++ b/tests/operator-harness/replay/file-thin-rs-discard-granularity.yaml @@ -1,24 +1,30 @@ name: file-thin-rs-discard-granularity description: | - Corner-case Q3: a FILE_THIN resource on a 4 KiB-discard loop backing - MUST render `rs-discard-granularity` into its DRBD disk options so a - partially-written volume resyncs only the written bytes (upstream - parity), NOT the whole device. + Corner-case Q3 (FILE_THIN disk-block render) + its fresh-create + convergence REGRESSION guard, at the operator-CLI level. - Pre-fix blockstor coupled rs-discard-granularity to the - discard-zeroes-if-aligned provider gate and so emitted NO disk block at - all for FILE_THIN — a 3rd-replica add resynced ~2x the bytes upstream - LINSTOR did. This workflow asserts, at the operator-CLI level, that: + The thin-aware-resync feature renders `discard-zeroes-if-aligned` + + `rs-discard-granularity` into the DRBD disk block so a partially-written + THIN volume resyncs only the written bytes. That win is gated to the + discard-zero-safe BLOCK-device providers (LVM_THIN / ZFS / ZFS_THIN). - 1. the rendered DRBD config on the diskful node carries - rs-discard-granularity (drbd_option, live `drbdsetup show`); - 2. discard-zeroes-if-aligned is `no` for FILE_THIN (provider gate - unchanged); and - 3. adding a 3rd replica converges to a clean UpToDate sync. + A loop-backed FILE_THIN volume reports a non-zero lsblk DISC-GRAN (4096) + but rendering `rs-discard-granularity` there REGRESSED fresh-create + convergence: when the elected day0 winner force-primaries to run mkfs + (the FileSystem/Type path), mkfs issues a full-device discard; on the + loop backing that discard storm, with rs-discard-granularity active, + dirtied the bitmap relative to the day0-seeded peers and forced a FULL + initial SyncTarget that wedged (the e2e respawn-standalone-wedge + failure). The identical create converges instantly on LVM_THIN, so the + option is gated OUT of loop-backed FILE_THIN. - The byte-count win itself is measured by the L6 cli-matrix cell - `file-thin-rs-discard-granularity.sh` (it dd's a partial pattern and - compares received-KiB before vs after the fix). + This workflow asserts, at the operator-CLI level, that on FILE_THIN: + + 1. discard-zeroes-if-aligned is `no` (provider gate unchanged); + 2. rs-discard-granularity is ABSENT from the live DRBD config + (drbdsetup show); and + 3. a 2nd diskful replica added to the fresh RD converges cleanly via + the day0 skip (sync_clean — no full resync of the empty device). prerequisites: min_nodes: 3 @@ -31,6 +37,9 @@ steps: - name: create-rd cmd: ["resource-definition", "create", "{{rd}}"] expect_exit: 0 + - name: set-filesystem-type + cmd: ["resource-definition", "set-property", "{{rd}}", "FileSystem/Type", "ext4"] + expect_exit: 0 - name: create-vd cmd: ["volume-definition", "create", "{{rd}}", "512M"] expect_exit: 0 @@ -43,25 +52,29 @@ steps: node: "{{node1}}" expected: UpToDate timeout_s: 120 - - name: assert-rs-discard-granularity-rendered + - name: assert-discard-zeroes-no cmd: ["resource", "list", "--resources", "{{rd}}"] expect_exit: 0 await: kind: drbd_option rd: "{{rd}}" node: "{{node1}}" - key: "rs-discard-granularity" - expected: "4096" + key: "discard-zeroes-if-aligned" + expected: "no" timeout_s: 60 - - name: assert-discard-zeroes-no + - name: assert-rs-discard-granularity-absent cmd: ["resource", "list", "--resources", "{{rd}}"] expect_exit: 0 await: + # rs-discard-granularity must NOT appear in `drbdsetup show` for a + # loop-backed FILE_THIN volume — its absence makes `actual` empty, + # matching expected "". A re-decoupling that re-introduces it would + # make actual=4096 != "" and fail here. kind: drbd_option rd: "{{rd}}" node: "{{node1}}" - key: "discard-zeroes-if-aligned" - expected: "no" + key: "rs-discard-granularity" + expected: "" timeout_s: 60 - name: r-c-diskful-node2 cmd: ["resource", "create", "{{node2}}", "{{rd}}", "--storage-pool", "{{sp}}"] From 640b0fa74ea366dc90c776c7067fee86be29d3e4 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sat, 6 Jun 2026 02:23:55 +0200 Subject: [PATCH 5/5] docs(satellite): correct autoDiskOptionsForResource godoc for FILE_THIN gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comment-only: the rs-discard-granularity gate is no longer INDEPENDENT of provider kind — it follows the discard-zero-safe block-device set and excludes loop-backed FILE_THIN. Align the godoc with the implementation. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/satellite/discardgran.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/satellite/discardgran.go b/pkg/satellite/discardgran.go index b23b87b0..7b2aad47 100644 --- a/pkg/satellite/discardgran.go +++ b/pkg/satellite/discardgran.go @@ -75,8 +75,11 @@ const ( // // It probes each local diskful volume's backing device. Each volume // contributes a discard-zeroes-if-aligned flag (yes/no per provider -// kind) and, when its device reports a non-zero discard granularity, an -// rs-discard-granularity value — INDEPENDENT gates (see autoDiskOptions). +// kind) and, for the discard-zero-safe BLOCK-device provider set +// (LVM_THIN / ZFS / ZFS_THIN) whose device reports a non-zero discard +// granularity, an rs-discard-granularity value. Loop-backed FILE_THIN is +// excluded from the granularity (see autoDiskOptions for the loop+mkfs +// wedge rationale). // // The resource-scope collapse is CONSERVATIVE for the (today // hypothetical) multi-volume case (mergeResourceDiscardOptions):