-
Notifications
You must be signed in to change notification settings - Fork 291
/
faults.go
181 lines (162 loc) · 5.75 KB
/
faults.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
//
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
package server
import (
"fmt"
"sort"
"strings"
"github.com/dustin/go-humanize"
"github.com/daos-stack/daos/src/control/build"
"github.com/daos-stack/daos/src/control/fault"
"github.com/daos-stack/daos/src/control/fault/code"
"github.com/daos-stack/daos/src/control/lib/ranklist"
"github.com/daos-stack/daos/src/control/server/engine"
)
// Predefined server-domain faults that need no runtime parameters. Each one
// is built by serverFault from a fault code, a description and a resolution.
var (
// FaultUnknown represents an unknown control server error. It carries an
// empty resolution string.
FaultUnknown = serverFault(
code.ServerUnknown,
"unknown control server error",
"",
)
// FaultIommuDisabled indicates that no IOMMU was detected while running as
// a non-root user with NVMe devices.
FaultIommuDisabled = serverFault(
code.ServerIommuDisabled,
"no IOMMU detected while running as non-root user with NVMe devices",
"enable IOMMU per the DAOS Admin Guide or run daos_server as root",
)
// FaultVfioDisabled indicates that disable_vfio is set to true in the
// config while running as a non-root user with NVMe devices.
FaultVfioDisabled = serverFault(
code.ServerVfioDisabled,
"disable_vfio: true in config while running as non-root user with NVMe devices",
"set disable_vfio: false or run daos_server as root",
)
// FaultHarnessNotStarted indicates that the harness (named after
// build.DataPlaneName in the description) has not been started.
FaultHarnessNotStarted = serverFault(
code.ServerHarnessNotStarted,
fmt.Sprintf("%s harness not started", build.DataPlaneName),
"retry the operation or check server logs for more details",
)
// FaultDataPlaneNotStarted indicates that an instance (named after
// build.DataPlaneName in the description) is not started or is not
// responding on dRPC.
FaultDataPlaneNotStarted = serverFault(
code.ServerDataPlaneNotStarted,
fmt.Sprintf("%s instance not started or not responding on dRPC", build.DataPlaneName),
"retry the operation or check server logs for more details",
)
// FaultPoolNoLabel indicates that a pool create request did not include a
// pool label.
FaultPoolNoLabel = serverFault(
code.ServerPoolNoLabel,
"cannot create a pool without a pool label",
"retry the operation with a label set",
)
// FaultPoolHasContainers indicates that a pool destroy request targeted a
// pool that still has containers.
FaultPoolHasContainers = serverFault(
code.ServerPoolHasContainers,
"cannot destroy a pool with existing containers",
"retry the operation with the recursive flag set to remove containers along with the pool",
)
// FaultHugepagesDisabled indicates that the use of hugepages has been
// disabled in the server config.
FaultHugepagesDisabled = serverFault(
code.ServerHugepagesDisabled,
"the use of hugepages has been disabled in the server config",
"set false (or remove) disable_hugepages parameter in config and reformat storage, then retry the operation",
)
)
// FaultPoolInvalidServiceReps builds the fault reported when a pool request
// asks for an invalid number of pool service replicas. maxSvcReps is printed
// as the upper bound of the accepted range in the description.
func FaultPoolInvalidServiceReps(maxSvcReps uint32) *fault.Fault {
	description := fmt.Sprintf(
		"pool service replicas number should be an odd number between 1 and %d",
		maxSvcReps,
	)
	const resolution = "retry the request with a valid number of pool service replicas"

	return serverFault(code.ServerPoolInvalidServiceReps, description, resolution)
}
// FaultInstancesNotStopped builds the fault reported when action is requested
// while the given rank is still running. Both action and rank appear in the
// description and in the resolution text.
func FaultInstancesNotStopped(action string, rank ranklist.Rank) *fault.Fault {
	description := fmt.Sprintf("%s not supported when rank %d is running", action, rank)
	resolution := fmt.Sprintf("retry %s operation after stopping rank %d", action, rank)

	return serverFault(code.ServerInstancesNotStopped, description, resolution)
}
// FaultPoolNvmeTooSmall builds the fault reported when the NVMe capacity
// requested for a pool is too small. The description quotes
// engine.NvmeMinBytesPerTarget; the resolution suggests minTotal and minNVMe,
// each increased by humanize.MiByte before being formatted.
func FaultPoolNvmeTooSmall(minTotal, minNVMe uint64) *fault.Fault {
	perTargetMin := humanize.IBytes(engine.NvmeMinBytesPerTarget)
	suggestedTotal := humanize.Bytes(minTotal + humanize.MiByte)
	suggestedNVMe := humanize.Bytes(minNVMe + humanize.MiByte)

	description := fmt.Sprintf("requested NVMe capacity too small (min %s per target)", perTargetMin)
	resolution := fmt.Sprintf(
		"retry the request with a pool size of at least %s, with at least %s NVMe",
		suggestedTotal, suggestedNVMe,
	)

	return serverFault(code.ServerPoolNvmeTooSmall, description, resolution)
}
// FaultPoolScmTooSmall builds the fault reported when the SCM capacity
// requested for a pool is too small. The description quotes
// engine.ScmMinBytesPerTarget; the resolution suggests minTotal and minSCM,
// each increased by humanize.MiByte before being formatted.
func FaultPoolScmTooSmall(minTotal, minSCM uint64) *fault.Fault {
	perTargetMin := humanize.IBytes(engine.ScmMinBytesPerTarget)
	suggestedTotal := humanize.Bytes(minTotal + humanize.MiByte)
	suggestedSCM := humanize.Bytes(minSCM + humanize.MiByte)

	description := fmt.Sprintf("requested SCM capacity is too small (min %s per target)", perTargetMin)
	resolution := fmt.Sprintf(
		"retry the request with a pool size of at least %s, with at least %s SCM",
		suggestedTotal, suggestedSCM,
	)

	return serverFault(code.ServerPoolScmTooSmall, description, resolution)
}
// FaultPoolInvalidRanks builds the fault reported when a pool request names
// ranks that are not valid. The offending ranks are listed in the description
// as a comma-separated list in ascending numeric order.
func FaultPoolInvalidRanks(invalid []ranklist.Rank) *fault.Fault {
	// Sort a private copy so the caller's slice is left untouched.
	ordered := make([]ranklist.Rank, len(invalid))
	copy(ordered, invalid)

	// Order by rank value rather than by string form: sorting the formatted
	// strings would place "10" ahead of "2".
	sort.Slice(ordered, func(i, j int) bool { return ordered[i] < ordered[j] })

	rs := make([]string, len(ordered))
	for i, r := range ordered {
		rs[i] = r.String()
	}

	return serverFault(
		code.ServerPoolInvalidRanks,
		fmt.Sprintf("pool request contains invalid ranks: %s", strings.Join(rs, ",")),
		"retry the request with a valid set of ranks",
	)
}
// FaultPoolInvalidNumRanks builds the fault reported when a pool request asks
// for an invalid number of ranks. req and avail are printed in the description
// as the requested and available counts.
func FaultPoolInvalidNumRanks(req, avail int) *fault.Fault {
	description := fmt.Sprintf(
		"pool request contains invalid number of ranks (requested: %d, available: %d)",
		req, avail,
	)
	const resolution = "retry the request with a valid number of ranks"

	return serverFault(code.ServerPoolInvalidNumRanks, description, resolution)
}
// FaultPoolDuplicateLabel builds the fault reported when the pool label dupe
// already exists in the system. The label is quoted in the description.
func FaultPoolDuplicateLabel(dupe string) *fault.Fault {
	description := fmt.Sprintf("pool label %q already exists in the system", dupe)
	const resolution = "retry the request with a unique pool label"

	return serverFault(code.ServerPoolDuplicateLabel, description, resolution)
}
// FaultEngineNUMAImbalance builds the fault reported when engines are unevenly
// distributed across NUMA nodes. nodeMap is printed with %v in the
// description.
// NOTE(review): nodeMap presumably maps NUMA node ID to engine count; confirm
// against the caller.
func FaultEngineNUMAImbalance(nodeMap map[int]int) *fault.Fault {
	description := fmt.Sprintf("uneven distribution of engines across NUMA nodes %v", nodeMap)
	const resolution = "config requires an equal number of engines assigned to each NUMA node"

	return serverFault(code.ServerConfigEngineNUMAImbalance, description, resolution)
}
// FaultScmUnmanaged builds the fault reported when the SCM mountpoint at
// mntPoint is unavailable and cannot be created or mounted. The resolution
// tells the operator to create mntPoint manually and retry.
func FaultScmUnmanaged(mntPoint string) *fault.Fault {
	description := fmt.Sprintf("the SCM mountpoint at %s is unavailable and can't be created/mounted", mntPoint)
	resolution := fmt.Sprintf("manually create %s and retry", mntPoint)

	return serverFault(code.ServerScmUnmanaged, description, resolution)
}
// FaultWrongSystem builds the fault reported when the system name in a request
// (reqName) does not match the running system's name (sysName). Both names are
// printed in the description.
func FaultWrongSystem(reqName, sysName string) *fault.Fault {
	description := fmt.Sprintf("request system does not match running system (%s != %s)", reqName, sysName)
	const resolution = "retry the request with the correct system name"

	return serverFault(code.ServerWrongSystem, description, resolution)
}
// FaultIncompatibleComponents builds the fault reported when two versioned
// components are not compatible. Both components are formatted with %s in the
// description.
func FaultIncompatibleComponents(self, other *build.VersionedComponent) *fault.Fault {
	description := fmt.Sprintf("components %s and %s are not compatible", self, other)
	const resolution = "retry the request with compatible components"

	return serverFault(code.ServerIncompatibleComponents, description, resolution)
}
// FaultNoCompatibilityInsecure builds the fault reported when two versions are
// not compatible in insecure mode. Both versions are formatted with %s in the
// description.
func FaultNoCompatibilityInsecure(self, other build.Version) *fault.Fault {
	description := fmt.Sprintf("versions %s and %s are not compatible in insecure mode", self, other)
	const resolution = "enable certificates or use identical component versions"

	return serverFault(code.ServerNoCompatibilityInsecure, description, resolution)
}
// serverFault returns a new fault in the "server" domain with the supplied
// fault code, description and resolution.
//
// The code parameter is named faultCode so that it does not shadow the
// imported code package inside the function body.
func serverFault(faultCode code.Code, desc, res string) *fault.Fault {
	return &fault.Fault{
		Domain:      "server",
		Code:        faultCode,
		Description: desc,
		Resolution:  res,
	}
}