Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion pkg/rest/autoplace.go
Original file line number Diff line number Diff line change
Expand Up @@ -2052,7 +2052,29 @@ func (s *Server) resolveTakeoverStorPool(ctx context.Context, rdName, takeoverNo
return "", errors.Wrapf(rgErr, "lookup RG %q for takeover pool resolution", rd.ResourceGroupName)
}

return rg.SelectFilter.StoragePool, nil
if pool := rg.SelectFilter.StoragePool; pool != "" {
return pool, nil
}

// Bug 364 (P1): also walk the RG's StoragePoolList default. The
// linstor-csi driver posts no body to
// `POST /v1/resource-definitions/{rd}/resources/{node}` and
// relies on RG-side propagation for the pool name; when the
// SC sets `linstor.csi.linbit.com/storagePool: <p>`,
// linstor-csi's RGCreate path lands the value under
// SelectFilter.StoragePoolList[0] (not .StoragePool). Pre-fix the
// fresh-create resolution only checked SelectFilter.StoragePool,
// so an `r c <node> <rd>` against such an RG persisted a Resource
// with empty Props["StorPoolName"] — the satellite reconciler
// then had no pool to bind to and the replica wedged at
// `Provisioning` forever. Matches the existing
// `resolveGatePoolName` walk shape (the per-pool capacity gate
// already tolerates StoragePoolList tier-4 for the same reason).
if len(rg.SelectFilter.StoragePoolList) > 0 {
return rg.SelectFilter.StoragePoolList[0], nil
}

return "", nil
}

// resolveStorPoolForFreshCreate implements the Bug 327 fix for the
Expand Down
242 changes: 242 additions & 0 deletions pkg/rest/bug_364_resource_create_pool_resolve_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
// SPDX-License-Identifier: Apache-2.0

/*
Copyright 2026 Cozystack contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rest

import (
"net/http"
"testing"

apiv1 "github.com/cozystack/blockstor/pkg/api/v1"
"github.com/cozystack/blockstor/pkg/store"
)

// Bug 364 (P1, hunt-caught 2026-06-02): `linstor r c <node> <rd>`
// without `--storage-pool` against an RG that pins its default via
// `select_filter.storage_pool_list` (not `select_filter.storage_pool`)
// created a Resource with empty `Props["StorPoolName"]`. The satellite
// reconciler then had no pool to bind to and the replica wedged at
// "Provisioning" — visible to the operator only as a phantom replica
// that never reached UpToDate.
//
// Bug-hunt v7 (2026-06-02) reproduced on a live dev stand:
//
// $ curl -sS -X PUT .../v1/resource-groups/testrg \
// -d '{"select_filter":{"storage_pool":"","storage_pool_list":["lvm-thin"]}}'
// 200
// $ linstor r c dev-worker-1 testlist
// SUCCESS: resource(s) created on resource-definition: testlist
// $ curl -sS .../v1/resource-definitions/testlist/resources \
// | jq '.[].props'
// null # <-- empty: StorPoolName missing
//
// linstor-csi posts no body to the per-node resource-create endpoint
// and relies on RG-side propagation for the pool name. When the
// StorageClass sets `linstor.csi.linbit.com/storagePool: <p>`,
// linstor-csi's RGCreate path lands the value under
// SelectFilter.StoragePoolList[0] (not .StoragePool), so this is the
// canonical CSI shape — every Cozystack volume hits this path.
//
// The fix extends `resolveTakeoverStorPool` to also walk the RG's
// `StoragePoolList[0]` (mirroring the existing `resolveGatePoolName`
// fallback chain — the per-pool capacity gate already tolerates that
// tier). 1 line of intent, 3 lines of fence.

// TestBug364ResourceCreatePicksUpStoragePoolList pins the canonical
// reproducer: an RG with only `storage_pool_list` (no
// `storage_pool`) must drive `resolveStorPoolForFreshCreate` to stamp
// the first entry onto `res.Props["StorPoolName"]`.
func TestBug364ResourceCreatePicksUpStoragePoolList(t *testing.T) {
t.Parallel()

st := store.NewInMemory()
ctx := t.Context()

if err := st.ResourceGroups().Create(ctx, &apiv1.ResourceGroup{
Name: "csirg",
SelectFilter: apiv1.AutoSelectFilter{
PlaceCount: 1,
StoragePoolList: []string{"lvm-thin"},
},
}); err != nil {
t.Fatalf("seed RG: %v", err)
}

if err := st.ResourceDefinitions().Create(ctx, &apiv1.ResourceDefinition{
Name: "rd1",
ResourceGroupName: "csirg",
}); err != nil {
t.Fatalf("seed RD: %v", err)
}

if err := st.Nodes().Create(ctx, &apiv1.Node{
Name: "n1",
Type: apiv1.NodeTypeSatellite,
NetInterfaces: []apiv1.NetInterface{
{Name: DefaultNetInterfaceName, Address: "10.0.0.1"},
},
}); err != nil {
t.Fatalf("seed Node: %v", err)
}

if err := st.StoragePools().Create(ctx, &apiv1.StoragePool{
NodeName: "n1",
StoragePoolName: "lvm-thin",
ProviderKind: apiv1.StoragePoolKindLVMThin,
}); err != nil {
t.Fatalf("seed SP: %v", err)
}

base, stop := startServerWithStore(t, st)
defer stop()

// linstor-csi shape: empty body, path-only intent.
resp := httpPost(t, base+"/v1/resource-definitions/rd1/resources/n1",
[]byte(""))
defer func() { _ = resp.Body.Close() }()

if resp.StatusCode != http.StatusCreated {
got, _ := readAllBody(resp)
t.Fatalf("status: got %d, want 201. Body: %s", resp.StatusCode, got)
}

res, err := st.Resources().Get(ctx, "rd1", "n1")
if err != nil {
t.Fatalf("re-fetch resource: %v", err)
}

if got, want := res.Props["StorPoolName"], "lvm-thin"; got != want {
t.Errorf("StorPoolName: got %q, want %q (Bug 364: storage_pool_list[0] must seed fresh-create)",
got, want)
}
}

// TestBug364ResolveTakeoverStorPoolFromList exercises the helper
// directly so a future refactor of the create-pipeline doesn't lose
// the tier-4 fallback semantics.
func TestBug364ResolveTakeoverStorPoolFromList(t *testing.T) {
t.Parallel()

st := store.NewInMemory()
ctx := t.Context()

if err := st.ResourceGroups().Create(ctx, &apiv1.ResourceGroup{
Name: "rg",
SelectFilter: apiv1.AutoSelectFilter{
StoragePoolList: []string{"pool-a", "pool-b"},
},
}); err != nil {
t.Fatalf("seed RG: %v", err)
}

if err := st.ResourceDefinitions().Create(ctx, &apiv1.ResourceDefinition{
Name: "rd",
ResourceGroupName: "rg",
}); err != nil {
t.Fatalf("seed RD: %v", err)
}

s := &Server{Store: st}

got, err := s.resolveTakeoverStorPool(ctx, "rd", "n1")
if err != nil {
t.Fatalf("resolveTakeoverStorPool: %v", err)
}

if want := "pool-a"; got != want {
t.Errorf("resolveTakeoverStorPool: got %q, want %q (must return StoragePoolList[0])",
got, want)
}
}

// TestBug364StoragePoolWinsOverList pins the precedence: when both
// SelectFilter.StoragePool and SelectFilter.StoragePoolList are set,
// the single .StoragePool takes priority (matches
// `resolveGatePoolName`'s tier ordering and upstream LINSTOR's
// CtrlRscCrtApiHelper).
func TestBug364StoragePoolWinsOverList(t *testing.T) {
t.Parallel()

st := store.NewInMemory()
ctx := t.Context()

if err := st.ResourceGroups().Create(ctx, &apiv1.ResourceGroup{
Name: "rg",
SelectFilter: apiv1.AutoSelectFilter{
StoragePool: "wins",
StoragePoolList: []string{"loses"},
},
}); err != nil {
t.Fatalf("seed RG: %v", err)
}

if err := st.ResourceDefinitions().Create(ctx, &apiv1.ResourceDefinition{
Name: "rd",
ResourceGroupName: "rg",
}); err != nil {
t.Fatalf("seed RD: %v", err)
}

s := &Server{Store: st}

got, err := s.resolveTakeoverStorPool(ctx, "rd", "n1")
if err != nil {
t.Fatalf("resolveTakeoverStorPool: %v", err)
}

if want := "wins"; got != want {
t.Errorf("resolveTakeoverStorPool: got %q, want %q (single .StoragePool must beat list)",
got, want)
}
}

// TestBug364NoListNoStoragePoolReturnsEmpty pins the "no pool
// anywhere" case — empty string with no error, so the downstream
// flow falls through to diskless / 404 paths as before.
func TestBug364NoListNoStoragePoolReturnsEmpty(t *testing.T) {
t.Parallel()

st := store.NewInMemory()
ctx := t.Context()

if err := st.ResourceGroups().Create(ctx, &apiv1.ResourceGroup{
Name: "rg",
SelectFilter: apiv1.AutoSelectFilter{},
}); err != nil {
t.Fatalf("seed RG: %v", err)
}

if err := st.ResourceDefinitions().Create(ctx, &apiv1.ResourceDefinition{
Name: "rd",
ResourceGroupName: "rg",
}); err != nil {
t.Fatalf("seed RD: %v", err)
}

s := &Server{Store: st}

got, err := s.resolveTakeoverStorPool(ctx, "rd", "n1")
if err != nil {
t.Fatalf("resolveTakeoverStorPool: %v", err)
}

if got != "" {
t.Errorf("resolveTakeoverStorPool: got %q, want %q (no pool config → empty)",
got, "")
}
}
121 changes: 121 additions & 0 deletions tests/e2e/resource-create-pool-resolve.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env bash
#
# usage: resource-create-pool-resolve.sh WORK_DIR
#
# Bug 364 (P1, hunt-caught 2026-06-02) — `linstor r c <node> <rd>`
# without `--storage-pool` against an RG that pins its default via
# `select_filter.storage_pool_list` (not `select_filter.storage_pool`)
# created a Resource with empty `Props["StorPoolName"]`. The satellite
# reconciler then had no pool to bind to and the replica wedged at
# "Provisioning" — visible to the operator only as a phantom replica
# that never reached UpToDate.
#
# linstor-csi is the canonical caller for this path: it posts no body
# to the per-node resource-create endpoint and relies on RG-side
# propagation for the pool name. When the StorageClass sets
# `linstor.csi.linbit.com/storagePool: <p>`, linstor-csi's RGCreate
# path lands the value under SelectFilter.StoragePoolList[0] (not
# .StoragePool), so every Cozystack volume hits this path.
#
# This e2e pins the fix on a live cluster: an RG with only
# `storage_pool_list` must drive the resource-create's fresh-create
# pool resolution to stamp Props["StorPoolName"] from the list's
# first entry, and the satellite must actually reach UpToDate.

set -euo pipefail

WORK_DIR=${1:?work_dir required}
export KUBECONFIG="$WORK_DIR/kubeconfig"

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=lib.sh
source "$SCRIPT_DIR/lib.sh"

require_workers 1

PF_PORT=$(python3 -c 'import socket; s=socket.socket(); s.bind(("127.0.0.1", 0)); print(s.getsockname()[1]); s.close()')
kubectl -n "$NS" port-forward deploy/blockstor-apiserver "$PF_PORT":3370 \
>/tmp/bug364-pf.log 2>&1 &
PF_PID=$!

cleanup() {
kill "$PF_PID" 2>/dev/null || true
wait "$PF_PID" 2>/dev/null || true
curl -s -X DELETE "http://localhost:$PF_PORT/v1/resource-definitions/bug364-rd" >/dev/null 2>&1 || true
curl -s -X DELETE "http://localhost:$PF_PORT/v1/resource-groups/bug364-rg" >/dev/null 2>&1 || true
}
Comment on lines +41 to +46
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In the cleanup function, kill "$PF_PID" is called before the curl -s -X DELETE commands. This terminates the port-forwarding process, causing the subsequent curl deletion requests to fail with "Connection refused". As a result, the test resources (bug364-rd and bug364-rg) are not cleaned up from the cluster.

To fix this, perform the curl deletion requests before killing the port-forward process.

Suggested change
cleanup() {
kill "$PF_PID" 2>/dev/null || true
wait "$PF_PID" 2>/dev/null || true
curl -s -X DELETE "http://localhost:$PF_PORT/v1/resource-definitions/bug364-rd" >/dev/null 2>&1 || true
curl -s -X DELETE "http://localhost:$PF_PORT/v1/resource-groups/bug364-rg" >/dev/null 2>&1 || true
}
cleanup() {
if [[ -n "${PF_PORT:-}" ]]; then
curl -s -X DELETE "http://localhost:$PF_PORT/v1/resource-definitions/bug364-rd" >/dev/null 2>&1 || true
curl -s -X DELETE "http://localhost:$PF_PORT/v1/resource-groups/bug364-rg" >/dev/null 2>&1 || true
fi
if [[ -n "${PF_PID:-}" ]]; then
kill "$PF_PID" 2>/dev/null || true
wait "$PF_PID" 2>/dev/null || true
fi
}

trap cleanup EXIT

for _ in $(seq 1 30); do
if curl -sf -m1 "http://localhost:$PF_PORT/v1/nodes" >/dev/null 2>&1; then
break
fi
sleep 0.5
done
Comment on lines +49 to +54
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

If the port-forwarding fails to start or bind, the wait loop will silently exhaust all 30 attempts and the script will continue, only to fail later with a less clear connection error. It is better to explicitly check if the port-forwarding succeeded and fail with a clear message and the port-forward logs if it didn't.

Suggested change
for _ in $(seq 1 30); do
if curl -sf -m1 "http://localhost:$PF_PORT/v1/nodes" >/dev/null 2>&1; then
break
fi
sleep 0.5
done
for i in {1..30}; do
if curl -sf -m1 "http://localhost:$PF_PORT/v1/nodes" >/dev/null 2>&1; then
break
fi
if [[ $i -eq 30 ]]; then
echo "FAIL: port-forward failed to start"
cat /tmp/bug364-pf.log
exit 1
fi
sleep 0.5
done


B="http://localhost:$PF_PORT"

# --- Setup: RG with only storage_pool_list ---
echo ">> seed bug364-rg with storage_pool_list (no storage_pool)"
curl -sf -X POST "$B/v1/resource-groups" -H "Content-Type: application/json" \
-d '{"name":"bug364-rg","select_filter":{"place_count":1,"storage_pool_list":["lvm-thin"]}}' >/dev/null

echo ">> seed bug364-rd in bug364-rg"
curl -sf -X POST "$B/v1/resource-definitions" -H "Content-Type: application/json" \
-d '{"resource_definition":{"name":"bug364-rd","resource_group_name":"bug364-rg"}}' >/dev/null

curl -sf -X POST "$B/v1/resource-definitions/bug364-rd/volume-definitions" \
-H "Content-Type: application/json" \
-d '{"volume_definition":{"size_kib":65536}}' >/dev/null

# --- Bug 364: r c <node> bug364-rd without --storage-pool ---
echo ">> POST resource-create on $WORKER_1 with empty body (linstor-csi shape)"
CODE=$(curl -s -o /tmp/bug364-resp.txt -w "%{http_code}" -X POST \
"$B/v1/resource-definitions/bug364-rd/resources/$WORKER_1" \
-H "Content-Type: application/json" \
-d '')
if [[ "$CODE" != "201" ]]; then
echo "FAIL: r c returned $CODE, expected 201"
cat /tmp/bug364-resp.txt
exit 1
fi
echo " 201 OK"

# --- Persistence: Props["StorPoolName"] must be stamped ---
POOL=$(curl -sf "$B/v1/resource-definitions/bug364-rd/resources" | python3 -c '
import sys, json
for r in json.load(sys.stdin):
props = r.get("props") or {}
print(props.get("StorPoolName", ""))
break
')
if [[ "$POOL" != "lvm-thin" ]]; then
echo "FAIL: Props[StorPoolName] = $POOL, expected lvm-thin (Bug 364)"
curl -sf "$B/v1/resource-definitions/bug364-rd/resources" | python3 -m json.tool
exit 1
fi
echo ">> Props[StorPoolName] = lvm-thin (Bug 364 fix OK)"

# --- Live convergence: the replica must reach UpToDate ---
echo ">> wait for replica UpToDate on $WORKER_1"
for _ in $(seq 1 30); do
STATE=$(curl -sf "$B/v1/resource-definitions/bug364-rd/resources" | python3 -c '
import sys, json
for r in json.load(sys.stdin):
state = r.get("state") or {}
print(state.get("drbd_state", ""))
break
')
if [[ "$STATE" == "UpToDate" ]]; then
echo " replica UpToDate after wait"
break
fi
sleep 2
done

if [[ "$STATE" != "UpToDate" ]]; then
echo "FAIL: replica never reached UpToDate (last state: $STATE)"
exit 1
fi

echo ">> PASS: Bug 364 — r c without --storage-pool resolves through RG StoragePoolList[0]"