-
Notifications
You must be signed in to change notification settings - Fork 0
/
pod_pending_investigator.go
202 lines (180 loc) · 9.56 KB
/
pod_pending_investigator.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/*
* Copyright FMR LLC <opensource@fidelity.com>
*
 * SPDX-License-Identifier: Apache-2.0
*/
package investigators
import (
"context"
"fmt"
"regexp"
"strings"
"sync"
"github.com/fidelity/theliv/internal/problem"
v1 "k8s.io/api/core/v1"
)
// Regular-expression patterns matched (case-insensitively, via msgMatch)
// against Pod event messages to classify why a Pod is stuck in Pending.
const (
	// Scheduler summary prefix, e.g. "0/3 nodes are available: ...".
	NodesNotAvailable = "0/.* nodes are available"
	// Node-selection failures reported in the FailedScheduling message.
	PendingNodeSelector      = "didn't match .*selector"
	PendingNodeAffinity      = "didn't match .*affinity"
	PendingNodeTaint         = "untolerated taint"
	PendingNodeUnschedulable = "unschedulable"
	// Resource and host-port constraints.
	PendingInsufficient = "Insufficient"
	PendingNoHostPort   = "node(s) didn't have free ports"
	// PVC lookup/binding failures seen in FailedScheduling events.
	PendingPVCGetErr   = "error getting PVC"
	PendingPVCNotFound = "persistentvolumeclaim .* not found"
	PendingUnboundPVC  = "pod has unbound immediate PersistentVolumeClaims"
	PendingBindFailed  = "Failed to bind volumes"
	// Missing mount sources seen in FailedMount events.
	PendingCmNotFound     = "configmap .* not found"
	PendingSecretNotFound = "secret .* not found" //nolint:gosec
)
// Solution and command message templates.
//
// Two placeholder conventions are used:
//   - "%d" is a sequence number filled in later by appendSeq so that the
//     rendered solutions/commands form a numbered list.
//   - "{{ .ObjectMeta.* }}" fields are rendered against the affected Pod
//     (presumably via text/template inside GetSolutionsByTemplate — that
//     helper is defined outside this file; verify there).
//
// Note: a few templates (PVCNotFoundSolution, PVCUnboundSolution,
// KubectlPodAndPVC, PendingUnknownSolution) carry hard-coded list numbers
// instead of "%d".
const (
	FailedSchedulingMessage = "%d. Pod failed scheduling, message is: %s."
	NodeUnavailableSolution = "%d. No node is available for the Pod, you may need to add new Node. Refer to: https://kubernetes.io/docs/tasks/debug-application-cluster/debug-cluster/"
	PendingNodeUnschedulableSolution = "%d. Some nodes are unschedulable, try to uncordon these nodes may fix this."
	PendingNodeSelectorSolution = "%d. Some nodes don't match the Pod node-selector/affinity, can check and adjust Pod node-selector/affinity. Refer to: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity"
	PendingNodeTaintSolution = "%d. Some node(s) had taints, that the pod didn't tolerate. Try to modify the pod to tolerate 1 of them. Refer to: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/"
	PendingInsufficientSolution = "%d. Some available node(s) has insufficient resources, check the resources that the pod requests or limits, try to modify them to applicable quota."
	PendingNoHostPortSolution = "%d. Available node(s) didn't have free ports for the requested pod ports. Please check the HostPort used in the Pod, change/remove it is suggested."
	PVCNotFoundSolution = "2. Pod {{ .ObjectMeta.Name }} is pending, used PVC not found." + KubectlPodAndPVC
	PVCUnboundSolution  = "2. Pod {{ .ObjectMeta.Name }} is pending, due to use an unbound PVC." + KubectlPodAndPVC
	KubectlPodAndPVC    = `
3. Please check PVC used by the pod, create new or choose an existing PVC may solve this problem. Refer to: https://kubernetes.io/docs/concepts/storage/persistent-volumes/`
	ContainerFailMount         = "%d. Container failed mount, message is: %s."
	ContainerFailMountSolution = "%d. Please check your volumes of the Pod, try to change to correct and existing resources may fix this problem."
	CmNotFoundSolution         = "%d. Please check the configMap that mount, try to change to an existing configMap may fix this issue."
	SecretNotFoundSolution     = "%d. Please check the secret that mount, try to change to an existing secret may fix this issue."
	PendingUnknownSolution     = `
1. Pod {{ .ObjectMeta.Name }} is in Pending state for more than 5 mins.
2. The root cause can be any of below:
3. Your target node(s) may not be available, you may can restart or create new node.
4. Node(s) may have labels or taints, try to change your pod node-selector, affinity, tolerance, may fix this.
5. Existing node(s) may don't have enough resources, try to change your pod resource requests may fix this.
6. If you use any volumes, please make sure the resources you want to mount exists.
`
	// Suggested kubectl commands, numbered via appendSeq like the solutions.
	KubeDescribePoCmd   = "%d. kubectl describe po {{.ObjectMeta.Name}} -n {{.ObjectMeta.Namespace}}"
	GetEventsCmd        = "%d. kubectl get events --field-selector involvedObject.name={{.ObjectMeta.Name}} -n {{.ObjectMeta.Namespace}}"
	GetNoAllCmd         = "%d. kubectl get no"
	GetNoAllocatableCmd = "%d. kubectl get no -o custom-columns=NAME:.metadata.name,ALLOCATABLE:.status.allocatable --no-headers"
	GetNoLabelCmd       = "%d. kubectl get no --show-labels"
	GetNoTaintCmd       = "%d. kubectl get no -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints --no-headers"
	UncordonCmd         = "%d. kubectl uncordon <node name>"
	GetPvcCmd           = "%d. kubectl get pvc -n {{ .ObjectMeta.Namespace }}"
	GetCmCmd            = "%d. kubectl get cm -n {{ .ObjectMeta.Namespace }}"
	GetSecretCmd        = "%d. kubectl get secret -n {{ .ObjectMeta.Namespace }}"
)
// PodNotRunningInvestigator inspects a pending Pod's events and records
// likely root causes plus suggested kubectl commands on the given problem.
//
// It checks the Pod's "FailedScheduling" events first, then "FailedMount"
// events, and falls back to a generic pending-Pod checklist when neither is
// present:
//   - FailedScheduling: the first event message is matched against known
//     patterns (PVC issues, no available nodes, selectors/affinity, taints,
//     insufficient resources, host ports) to append targeted solutions.
//   - FailedMount: reports missing ConfigMaps/Secrets referenced by volumes.
//
// Intended to run as a goroutine; wg.Done is deferred.
func PodNotRunningInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, input *problem.DetectorCreationInput) {
	defer wg.Done()
	pod := *problem.AffectedResources.Resource.(*v1.Pod)
	var solutions []string
	var commands []string
	failScheduleEvent := getPoEventMsg(ctx, input, &pod, "FailedScheduling")
	failMount := getPoEventMsg(ctx, input, &pod, "FailedMount")
	if len(failScheduleEvent) > 0 {
		failSchedule := failScheduleEvent[0]
		solutions = []string{fmt.Sprintf(FailedSchedulingMessage, 1, failSchedule)}
		commands = []string{}
		// Equivalent PVC branches merged: both "get" errors lead to the
		// not-found solution, both unbound/bind-failure cases to the
		// unbound solution.
		if msgMatch(PendingPVCGetErr, failSchedule) || msgMatch(PendingPVCNotFound, failSchedule) {
			solutions, commands = appendPVSolution(ctx, pod, solutions, PVCNotFoundSolution)
		} else if msgMatch(PendingUnboundPVC, failSchedule) || msgMatch(PendingBindFailed, failSchedule) {
			solutions, commands = appendPVSolution(ctx, pod, solutions, PVCUnboundSolution)
		} else if msgMatch(NodesNotAvailable, failSchedule) {
			solutions = appendSeq(solutions, NodeUnavailableSolution)
			commands = appendSeq(commands, GetNoAllCmd)
			commands = appendSeq(commands, GetNoAllocatableCmd)
			// The scheduler summary can carry several causes at once, so
			// each sub-pattern is checked independently (not else-if).
			if msgMatch(PendingNodeUnschedulable, failSchedule) {
				solutions = appendSeq(solutions, PendingNodeUnschedulableSolution)
				commands = appendSeq(commands, UncordonCmd)
			}
			if msgMatch(PendingNodeSelector, failSchedule) || msgMatch(PendingNodeAffinity, failSchedule) {
				solutions = appendSeq(solutions, PendingNodeSelectorSolution)
				commands = appendSeq(commands, GetNoLabelCmd)
			}
			if msgMatch(PendingNodeTaint, failSchedule) {
				solutions = appendSeq(solutions, PendingNodeTaintSolution)
				commands = appendSeq(commands, GetNoTaintCmd)
			}
			if msgMatch(PendingInsufficient, failSchedule) {
				solutions = appendSeq(solutions, PendingInsufficientSolution)
			}
			if msgMatch(PendingNoHostPort, failSchedule) {
				solutions = appendSeq(solutions, PendingNoHostPortSolution)
			}
		} else {
			solutions, commands = getPendingPodUnknownSolution(ctx, pod)
		}
	} else if len(failMount) > 0 {
		commands = appendSeq(commands, GetSolutionsByTemplate(ctx, KubeDescribePoCmd, pod, true)[0])
		commands = appendSeq(commands, GetSolutionsByTemplate(ctx, GetEventsCmd, pod, true)[0])
		for _, event := range failMount {
			if msgMatch(PendingCmNotFound, event) {
				// FIX: the sequence number was hard-coded to 1, so every
				// matched event rendered as "1. ..." and later appendSeq
				// numbering was inconsistent; number by list position.
				solutions = append(solutions, fmt.Sprintf(ContainerFailMount, len(solutions)+1, event))
				solutions = appendSeq(solutions, CmNotFoundSolution)
				commands = appendSeq(commands, GetSolutionsByTemplate(ctx, GetCmCmd, pod, true)[0])
			}
			if msgMatch(PendingSecretNotFound, event) {
				solutions = append(solutions, fmt.Sprintf(ContainerFailMount, len(solutions)+1, event))
				solutions = appendSeq(solutions, SecretNotFoundSolution)
				commands = appendSeq(commands, GetSolutionsByTemplate(ctx, GetSecretCmd, pod, true)[0])
			}
		}
		if len(solutions) == 0 {
			// FIX: the already-formatted message was previously passed
			// through appendSeq, which ran Sprintf on it again and appended
			// "%!(EXTRA int=1)" (no %d verb left, surplus argument).
			// Append it directly; the list is empty so its number is 1.
			solutions = append(solutions, fmt.Sprintf(ContainerFailMount, 1, failMount[0]))
			solutions = appendSeq(solutions, ContainerFailMountSolution)
		}
	} else {
		solutions, commands = getPendingPodUnknownSolution(ctx, pod)
	}
	appendSolution(problem, solutions, commands)
}
// appendPVSolution renders the given PVC-related solution template for the
// pod, appends the rendered text to solutions, and builds the standard set
// of diagnostic commands (describe pod, get events, get pvc).
// It returns the extended solutions slice and the new commands slice.
func appendPVSolution(ctx context.Context, po v1.Pod, solutions []string, solution string) ([]string, []string) {
	solutions = append(solutions, GetSolutionsByTemplate(ctx, solution, po, true)...)
	var commands []string
	for _, tmpl := range []string{KubeDescribePoCmd, GetEventsCmd, GetPvcCmd} {
		commands = appendSeq(commands, GetSolutionsByTemplate(ctx, tmpl, po, true)[0])
	}
	return solutions, commands
}
// appendSeq appends message to solution, substituting the message's %d verb
// with the 1-based position the new entry will occupy in the list.
func appendSeq(solution []string, message string) []string {
	next := len(solution) + 1
	numbered := fmt.Sprintf(message, next)
	return append(solution, numbered)
}
// msgMatch reports whether msg2 matches the regular expression msg1.
// Both pattern and message are lowercased first, making the comparison
// case-insensitive. A pattern that fails to compile is treated as no match.
func msgMatch(msg1 string, msg2 string) bool {
	matched, err := regexp.MatchString(strings.ToLower(msg1), strings.ToLower(msg2))
	return err == nil && matched
}
// getPoEventMsg collects the non-empty messages of the pod's events whose
// Reason equals the given reason. It returns nil when the event lookup
// fails or when nothing matches.
func getPoEventMsg(ctx context.Context, input *problem.DetectorCreationInput, pod *v1.Pod, reason string) (msg []string) {
	events, err := GetPodEvents(ctx, input, pod)
	if err != nil {
		return nil
	}
	for _, e := range events {
		if e.Reason != reason || e.Message == "" {
			continue
		}
		msg = append(msg, e.Message)
	}
	return msg
}
// PodNotRunningSolutionsInvestigator is a placeholder for a second-pass
// solutions investigator matching the investigator goroutine signature.
// It currently only marks the WaitGroup as done; the commented-out lines
// sketch the intended implementation.
func PodNotRunningSolutionsInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, input *problem.DetectorCreationInput) {
	defer wg.Done()
	// Generate solutions
	// detail := "do something to provide solutions"
	// problem.SolutionDetails = append(problem.SolutionDetails, detail)
}
// getPendingPodUnknownSolution returns the generic pending-Pod checklist
// rendered for the pod, together with the standard diagnostic commands
// (describe pod, get events).
func getPendingPodUnknownSolution(ctx context.Context, po v1.Pod) ([]string, []string) {
	var commands []string
	for _, tmpl := range []string{KubeDescribePoCmd, GetEventsCmd} {
		commands = appendSeq(commands, GetSolutionsByTemplate(ctx, tmpl, po, true)[0])
	}
	return GetSolutionsByTemplate(ctx, PendingUnknownSolution, po, true), commands
}