sysdump: collect 'clustermesh-apiserver' + improvements #513

Merged · 5 commits · Sep 2, 2021
2 changes: 1 addition & 1 deletion connectivity/check/deployment.go
@@ -542,7 +542,7 @@ func (ct *ConnectivityTest) waitForIPCache(ctx context.Context, pod Pod) error {
r := time.After(time.Second)

stdout, err := pod.K8sClient.ExecInPodWithTTY(ctx, pod.Pod.Namespace, pod.Pod.Name,
"cilium-agent", []string{"cilium", "bpf", "ipcache", "list", "-o", "json"})
defaults.AgentContainerName, []string{"cilium", "bpf", "ipcache", "list", "-o", "json"})
if err == nil {
var ic ipCache

4 changes: 2 additions & 2 deletions connectivity/check/policy.go
@@ -85,7 +85,7 @@ func (t *Test) waitCiliumPolicyRevisions(ctx context.Context, revisions map[Pod]
// getCiliumPolicyRevision returns the current policy revision of a Cilium pod.
func getCiliumPolicyRevision(ctx context.Context, pod Pod) (int, error) {
stdout, err := pod.K8sClient.ExecInPodWithTTY(ctx, pod.Pod.Namespace, pod.Pod.Name,
"cilium-agent", []string{"cilium", "policy", "get", "-o", "jsonpath='{.revision}'"})
defaults.AgentContainerName, []string{"cilium", "policy", "get", "-o", "jsonpath='{.revision}'"})
if err != nil {
return 0, err
}
@@ -100,7 +100,7 @@ func getCiliumPolicyRevision(ctx context.Context, pod Pod) (int, error) {
func waitCiliumPolicyRevision(ctx context.Context, pod Pod, rev int, timeout time.Duration) error {
timeoutStr := strconv.Itoa(int(timeout.Seconds()))
_, err := pod.K8sClient.ExecInPodWithTTY(ctx, pod.Pod.Namespace, pod.Pod.Name,
"cilium-agent", []string{"cilium", "policy", "wait", strconv.Itoa(rev), "--max-wait-time", timeoutStr})
defaults.AgentContainerName, []string{"cilium", "policy", "wait", strconv.Itoa(rev), "--max-wait-time", timeoutStr})
return err
}

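Both call sites above now take the agent container name from the defaults package instead of repeating the "cilium-agent" literal. A minimal caller sketch for the wait helper, assuming the revisions map gathered by waitCiliumPolicyRevisions holds an int per pod (only the helper's signature is taken from this diff):

    // Fragment, not runnable on its own: ctx, revisions, and the timeout value
    // are placeholders standing in for the surrounding test code.
    for pod, rev := range revisions {
        if err := waitCiliumPolicyRevision(ctx, pod, rev, 30*time.Second); err != nil {
            return fmt.Errorf("pod %s did not reach policy revision %d: %w", pod.Pod.Name, rev, err)
        }
    }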
2 changes: 2 additions & 0 deletions defaults/defaults.go
@@ -6,6 +6,7 @@ package defaults
import "time"

const (
AgentContainerName = "cilium-agent"
AgentServiceAccountName = "cilium"
AgentClusterRoleName = "cilium"
AgentDaemonSetName = "cilium"
@@ -29,6 +30,7 @@ const (
HubbleSocketPath = "/var/run/cilium/hubble.sock"
HubbleServerSecretName = "hubble-server-certs"

RelayContainerName = "hubble-relay"
RelayDeploymentName = "hubble-relay"
RelayClusterRoleName = "hubble-relay"
RelayServiceAccountName = "hubble-relay"
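The two new constants give the container names a single source of truth; callers such as the connectivity checks, the installer, and the sysdump collector reference them instead of string literals. A small, self-contained illustration (module path taken from the imports elsewhere in this PR):

    package main

    import (
        "fmt"

        "github.com/cilium/cilium-cli/defaults"
    )

    func main() {
        // Shared container names introduced above; the values match the former literals.
        fmt.Println(defaults.AgentContainerName) // cilium-agent
        fmt.Println(defaults.RelayContainerName) // hubble-relay
    }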
2 changes: 1 addition & 1 deletion hubble/relay.go
@@ -95,7 +95,7 @@ func (k *K8sHubble) generateRelayDeployment() *appsv1.Deployment {
ServiceAccountName: defaults.RelayServiceAccountName,
Containers: []corev1.Container{
{
Name: "hubble-relay",
Name: defaults.RelayContainerName,
Command: []string{"hubble-relay"},
Args: []string{
"serve",
2 changes: 1 addition & 1 deletion install/install.go
@@ -257,7 +257,7 @@ func (k *K8sInstaller) generateAgentDaemonSet() *appsv1.DaemonSet {
},
Containers: []corev1.Container{
{
Name: "cilium-agent",
Name: defaults.AgentContainerName,
Command: []string{"cilium-agent"},
Args: []string{"--config-dir=/tmp/cilium/config-map"},
Image: k.fqAgentImage(),
15 changes: 3 additions & 12 deletions internal/cli/cmd/sysdump.go
@@ -49,18 +49,15 @@ func newCmdSysdump() *cobra.Command {
cmd.Flags().StringVar(&sysdumpOptions.CiliumOperatorLabelSelector,
"cilium-operator-label-selector", sysdump.DefaultCiliumOperatorLabelSelector,
"The labels used to target Cilium operator pods")
cmd.Flags().StringVar(&sysdumpOptions.CiliumOperatorNamespace,
"cilium-operator-namespace", sysdump.DefaultCiliumOperatorNamespace,
"The namespace Cilium operator is running in")
cmd.Flags().StringVar(&sysdumpOptions.ClustermeshApiserverLabelSelector,
"clustermesh-apiserver-label-selector", sysdump.DefaultClustermeshApiserverLabelSelector,
"The labels used to target 'clustermesh-apiserver' pods")
cmd.Flags().BoolVar(&sysdumpOptions.Debug,
"debug", sysdump.DefaultDebug,
"Whether to enable debug logging")
cmd.Flags().StringVar(&sysdumpOptions.HubbleLabelSelector,
"hubble-label-selector", sysdump.DefaultHubbleLabelSelector,
"The labels used to target Hubble pods")
cmd.Flags().StringVar(&sysdumpOptions.HubbleNamespace,
"hubble-namespace", sysdump.DefaultHubbleNamespace,
"The namespace Hubble is running in")
cmd.Flags().Int64Var(&sysdumpOptions.HubbleFlowsCount,
"hubble-flows-count", sysdump.DefaultHubbleFlowsCount,
"Number of Hubble flows to collect. Setting to zero disables collecting Hubble flows.")
@@ -70,15 +67,9 @@ func newCmdSysdump() *cobra.Command {
cmd.Flags().StringVar(&sysdumpOptions.HubbleRelayLabelSelector,
"hubble-relay-labels", sysdump.DefaultHubbleRelayLabelSelector,
"The labels used to target Hubble Relay pods")
cmd.Flags().StringVar(&sysdumpOptions.HubbleRelayNamespace,
"hubble-relay-namespace", sysdump.DefaultHubbleRelayNamespace,
"The namespace Hubble Relay is running in")
cmd.Flags().StringVar(&sysdumpOptions.HubbleUILabelSelector,
"hubble-ui-labels", sysdump.DefaultHubbleUILabelSelector,
"The labels used to target Hubble UI pods")
cmd.Flags().StringVar(&sysdumpOptions.HubbleUINamespace,
"hubble-ui-namespace", sysdump.DefaultHubbleUINamespace,
"The namespace Hubble UI is running in")
cmd.Flags().Int64Var(&sysdumpOptions.LogsLimitBytes,
"logs-limit-bytes", sysdump.DefaultLogsLimitBytes,
"The limit on the number of bytes to retrieve when collecting logs")
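With the per-component namespace flags removed, every workload (operator, Hubble, Hubble Relay, Hubble UI, and now clustermesh-apiserver) is looked up in the single Cilium namespace. A hedged sketch of building the options programmatically; in the CLI they are populated by the flags above:

    package main

    import (
        "fmt"

        "github.com/cilium/cilium-cli/sysdump"
    )

    func main() {
        // Field and constant names are taken from this PR; constructing Options
        // directly like this is an assumption for illustration only.
        opts := sysdump.Options{
            CiliumNamespace:                   "kube-system",
            ClustermeshApiserverLabelSelector: sysdump.DefaultClustermeshApiserverLabelSelector,
        }
        fmt.Printf("%+v\n", opts)
    }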
6 changes: 4 additions & 2 deletions internal/k8s/client.go
@@ -36,6 +36,8 @@ import (

// Register all auth providers (azure, gcp, oidc, openstack, ..).
_ "k8s.io/client-go/plugin/pkg/client/auth"

"github.com/cilium/cilium-cli/defaults"
)

type Client struct {
@@ -281,7 +283,7 @@ var logSplitter = regexp.MustCompile(`\r?\n[^ ]+ level=[[:alpha:]]+ msg=`)

func (c *Client) CiliumLogs(ctx context.Context, namespace, pod string, since time.Time, filter *regexp.Regexp) (string, error) {
opts := &corev1.PodLogOptions{
Container: "cilium-agent",
Container: defaults.AgentContainerName,
Timestamps: true,
SinceTime: &metav1.Time{Time: since},
}
@@ -371,7 +373,7 @@ func (c *Client) ExecInPod(ctx context.Context, namespace, pod, container string
}

func (c *Client) CiliumStatus(ctx context.Context, namespace, pod string) (*models.StatusResponse, error) {
stdout, err := c.ExecInPod(ctx, namespace, pod, "cilium-agent", []string{"cilium", "status", "-o", "json"})
stdout, err := c.ExecInPod(ctx, namespace, pod, defaults.AgentContainerName, []string{"cilium", "status", "-o", "json"})
if err != nil {
return nil, err
}
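Both helpers changed above now exec into (or read logs from) the container named by defaults.AgentContainerName. A hedged fragment showing how they might be called; client construction is not part of this diff, and the namespace and pod name are placeholders:

    // Fragment only: client and ctx come from the surrounding code.
    status, err := client.CiliumStatus(ctx, "kube-system", "cilium-xxxxx")
    if err != nil {
        return err
    }
    fmt.Printf("%+v\n", status)

    // Signature as shown in the hunk above: logs since a point in time,
    // optionally filtered by a regular expression.
    errorLogs, err := client.CiliumLogs(ctx, "kube-system", "cilium-xxxxx",
        time.Now().Add(-time.Hour), regexp.MustCompile(`level=error`))
    if err != nil {
        return err
    }
    fmt.Println(errorLogs)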
31 changes: 18 additions & 13 deletions sysdump/constants.go
@@ -6,23 +6,27 @@ package sysdump
import (
"regexp"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/cilium/cilium-cli/defaults"
)

const (
awsNodeDaemonSetName = "aws-node"
awsNodeDaemonSetNamespace = "kube-system"
ciliumAgentContainerName = "cilium-agent"
ciliumConfigConfigMapName = "cilium-config"
ciliumDaemonSetName = "cilium"
ciliumEtcdSecretsSecretName = "cilium-etcd-secrets"
ciliumOperatorDeploymentName = "cilium-operator"
hubbleContainerName = "hubble"
hubbleDaemonSetName = "hubble"
hubbleRelayContainerName = "hubble-relay"
hubbleRelayDeploymentName = "hubble-relay"
hubbleUIDeploymentName = "hubble-ui"
redacted = "XXXXXX"
awsNodeDaemonSetName = "aws-node"
awsNodeDaemonSetNamespace = metav1.NamespaceSystem
ciliumAgentContainerName = defaults.AgentContainerName
ciliumConfigConfigMapName = defaults.ConfigMapName
ciliumDaemonSetName = defaults.AgentDaemonSetName
ciliumEtcdSecretsSecretName = "cilium-etcd-secrets"
ciliumOperatorDeploymentName = defaults.OperatorDeploymentName
clustermeshApiserverDeploymentName = defaults.ClusterMeshDeploymentName
hubbleContainerName = "hubble"
hubbleDaemonSetName = "hubble"
hubbleRelayContainerName = defaults.RelayContainerName
hubbleRelayDeploymentName = defaults.RelayDeploymentName
hubbleUIDeploymentName = defaults.HubbleUIDeploymentName
redacted = "XXXXXX"
)

const (
@@ -39,6 +43,7 @@ const (
ciliumNetworkPoliciesFileName = "ciliumnetworkpolicies-<ts>.yaml"
ciliumNodesFileName = "ciliumnodes-<ts>.yaml"
ciliumOperatorDeploymentFileName = "cilium-operator-deployment-<ts>.yaml"
clustermeshApiserverDeploymentFileName = "clustermesh-apiserver-deployment-<ts>.yaml"
eniconfigsFileName = "aws-eniconfigs-<ts>.yaml"
gopsFileName = "gops-%s-%s-<ts>-%s.txt"
hubbleDaemonsetFileName = "hubble-daemonset-<ts>.yaml"
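The new clustermesh-apiserver file name follows the same convention as the other entries: the "<ts>" placeholder is swapped for a timestamp when the sysdump is written (as noted on DefaultOutputFileName in sysdump/defaults.go; the exact helper and time format are assumptions here). An illustrative expansion:

    package main

    import (
        "fmt"
        "strings"
        "time"
    )

    func main() {
        // Illustration only: expand the "<ts>" placeholder the way the sysdump
        // package describes; the format string below is an assumption.
        name := strings.ReplaceAll("clustermesh-apiserver-deployment-<ts>.yaml",
            "<ts>", time.Now().Format("20060102-150405"))
        fmt.Println(name) // e.g. clustermesh-apiserver-deployment-20210902-120000.yaml
    }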
37 changes: 17 additions & 20 deletions sysdump/defaults.go
@@ -14,26 +14,23 @@ const (
)

const (
DefaultCiliumLabelSelector = labelPrefix + "cilium"
DefaultCiliumNamespace = "kube-system"
DefaultCiliumOperatorLabelSelector = "io.cilium/app=operator"
DefaultCiliumOperatorNamespace = DefaultCiliumNamespace
DefaultDebug = false
DefaultHubbleLabelSelector = labelPrefix + "hubble"
DefaultHubbleNamespace = DefaultCiliumNamespace
DefaultHubbleFlowsCount = 10000
DefaultHubbleFlowsTimeout = 5 * time.Second
DefaultHubbleRelayLabelSelector = labelPrefix + "hubble-relay"
DefaultHubbleRelayNamespace = DefaultCiliumNamespace
DefaultHubbleUILabelSelector = labelPrefix + "hubble-ui"
DefaultHubbleUINamespace = DefaultCiliumNamespace
DefaultLargeSysdumpAbortTimeout = 5 * time.Second
DefaultLargeSysdumpThreshold = 20
DefaultLogsSinceTime = 8760 * time.Hour // 1y
DefaultLogsLimitBytes = 1073741824 // 1GiB
DefaultNodeList = ""
DefaultQuick = false
DefaultOutputFileName = "cilium-sysdump-<ts>" // "<ts>" will be replaced with the timestamp
DefaultCiliumLabelSelector = labelPrefix + "cilium"
DefaultCiliumNamespace = "kube-system"
DefaultCiliumOperatorLabelSelector = "io.cilium/app=operator"
DefaultClustermeshApiserverLabelSelector = labelPrefix + "clustermesh-apiserver"
DefaultDebug = false
DefaultHubbleLabelSelector = labelPrefix + "hubble"
DefaultHubbleFlowsCount = 10000
DefaultHubbleFlowsTimeout = 5 * time.Second
DefaultHubbleRelayLabelSelector = labelPrefix + "hubble-relay"
DefaultHubbleUILabelSelector = labelPrefix + "hubble-ui"
DefaultLargeSysdumpAbortTimeout = 5 * time.Second
DefaultLargeSysdumpThreshold = 20
DefaultLogsSinceTime = 8760 * time.Hour // 1y
DefaultLogsLimitBytes = 1073741824 // 1GiB
DefaultNodeList = ""
DefaultQuick = false
DefaultOutputFileName = "cilium-sysdump-<ts>" // "<ts>" will be replaced with the timestamp
)

var (
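DefaultClustermeshApiserverLabelSelector reuses the same labelPrefix as the other selectors (labelPrefix itself is defined outside this hunk), and all components now share DefaultCiliumNamespace. A tiny program that prints the compiled-in defaults:

    package main

    import (
        "fmt"

        "github.com/cilium/cilium-cli/sysdump"
    )

    func main() {
        // Both constants are exported by the hunk above; the selector's concrete
        // value depends on labelPrefix, which this diff does not show.
        fmt.Println(sysdump.DefaultClustermeshApiserverLabelSelector)
        fmt.Println(sysdump.DefaultCiliumNamespace) // kube-system
    }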
73 changes: 51 additions & 22 deletions sysdump/sysdump.go
@@ -32,26 +32,20 @@ type Options struct {
CiliumNamespace string
// The labels used to target Cilium operator pods.
CiliumOperatorLabelSelector string
// The namespace Cilium operator is running in.
CiliumOperatorNamespace string
// The labels used to target 'clustermesh-apiserver' pods.
ClustermeshApiserverLabelSelector string
// Whether to enable debug logging.
Debug bool
// The labels used to target Hubble pods.
HubbleLabelSelector string
// The namespace Hubble is running in.
HubbleNamespace string
// Number of Hubble flows to collect.
HubbleFlowsCount int64
// Timeout for collecting Hubble flows.
HubbleFlowsTimeout time.Duration
// The labels used to target Hubble Relay pods.
HubbleRelayLabelSelector string
// The namespace Hubble Relay is running in.
HubbleRelayNamespace string
// The labels used to target Hubble UI pods.
HubbleUILabelSelector string
// The namespace Hubble UI is running in.
HubbleUINamespace string
// The amount of time to wait for the user to cancel the sysdump on a large cluster.
LargeSysdumpAbortTimeout time.Duration
// The threshold on the number of nodes present in the cluster that triggers a warning message.
@@ -347,7 +341,7 @@ func (c *Collector) Run() error {
v, err := c.client.GetSecret(ctx, c.options.CiliumNamespace, ciliumEtcdSecretsSecretName, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
c.logDebug("secret %q not found in namespace %q - this is expected when using the CRD KVStore", ciliumEtcdSecretsSecretName, c.options.CiliumNamespace)
c.logDebug("Secret %q not found in namespace %q - this is expected when using the CRD KVStore", ciliumEtcdSecretsSecretName, c.options.CiliumNamespace)
return nil
}
return fmt.Errorf("failed to collect Cilium etcd secret: %w", err)
@@ -394,10 +388,10 @@ func (c *Collector) Run() error {
Description: "Collecting the Hubble daemonset",
Quick: true,
Task: func(ctx context.Context) error {
v, err := c.client.GetDaemonSet(ctx, c.options.HubbleNamespace, hubbleDaemonSetName, metav1.GetOptions{})
v, err := c.client.GetDaemonSet(ctx, c.options.CiliumNamespace, hubbleDaemonSetName, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
c.logDebug("daemonset %q not found in namespace %q - this is expected in recent versions of Cilium", hubbleDaemonSetName, c.options.HubbleNamespace)
c.logDebug("Daemonset %q not found in namespace %q - this is expected in recent versions of Cilium", hubbleDaemonSetName, c.options.CiliumNamespace)
return nil
}
return fmt.Errorf("failed to collect the Hubble daemonset: %w", err)
Expand All @@ -412,10 +406,10 @@ func (c *Collector) Run() error {
Description: "Collecting the Hubble Relay deployment",
Quick: true,
Task: func(ctx context.Context) error {
v, err := c.client.GetDeployment(ctx, c.options.HubbleRelayNamespace, hubbleRelayDeploymentName, metav1.GetOptions{})
v, err := c.client.GetDeployment(ctx, c.options.CiliumNamespace, hubbleRelayDeploymentName, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
c.logWarn("deployment %q not found in namespace %q", hubbleRelayDeploymentName, c.options.HubbleRelayNamespace)
c.logWarn("Deployment %q not found in namespace %q - this is expected if Hubble is not enabled", hubbleRelayDeploymentName, c.options.CiliumNamespace)
return nil
}
return fmt.Errorf("failed to collect the Hubble Relay deployment: %w", err)
@@ -430,10 +424,10 @@ func (c *Collector) Run() error {
Description: "Collecting the Hubble UI deployment",
Quick: true,
Task: func(ctx context.Context) error {
v, err := c.client.GetDeployment(ctx, c.options.HubbleUINamespace, hubbleUIDeploymentName, metav1.GetOptions{})
v, err := c.client.GetDeployment(ctx, c.options.CiliumNamespace, hubbleUIDeploymentName, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
c.logWarn("deployment %q not found in namespace %q", hubbleUIDeploymentName, c.options.HubbleUINamespace)
c.logWarn("Deployment %q not found in namespace %q - this is expected if Hubble UI is not enabled", hubbleUIDeploymentName, c.options.CiliumNamespace)
return nil
}
return fmt.Errorf("failed to collect the Hubble UI deployment: %w", err)
@@ -458,6 +452,24 @@ func (c *Collector) Run() error {
return nil
},
},
{
Description: "Collecting the 'clustermesh-apiserver' deployment",
Quick: true,
Task: func(ctx context.Context) error {
v, err := c.client.GetDeployment(ctx, c.options.CiliumNamespace, clustermeshApiserverDeploymentName, metav1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
c.logWarn("Deployment %q not found in namespace %q - this is expected if 'clustermesh-apiserver' isn't enabled", clustermeshApiserverDeploymentName, c.options.CiliumNamespace)
return nil
}
return fmt.Errorf("failed to collect the 'clustermesh-apiserver' deployment: %w", err)
}
if err := writeYaml(absoluteTempPath(clustermeshApiserverDeploymentFileName), v); err != nil {
return fmt.Errorf("failed to collect the 'clustermesh-apiserver' deployment: %w", err)
}
return nil
},
},
{
CreatesSubtasks: true,
Description: "Collecting gops stats from Cilium pods",
@@ -480,7 +492,7 @@ func (c *Collector) Run() error {
Description: "Collecting gops stats from Hubble pods",
Quick: true,
Task: func(ctx context.Context) error {
p, err := c.client.ListPods(ctx, c.options.HubbleNamespace, metav1.ListOptions{
p, err := c.client.ListPods(ctx, c.options.CiliumNamespace, metav1.ListOptions{
LabelSelector: c.options.HubbleLabelSelector,
})
if err != nil {
@@ -497,7 +509,7 @@ func (c *Collector) Run() error {
Description: "Collecting gops stats from Hubble Relay pods",
Quick: true,
Task: func(ctx context.Context) error {
p, err := c.client.ListPods(ctx, c.options.HubbleNamespace, metav1.ListOptions{
p, err := c.client.ListPods(ctx, c.options.CiliumNamespace, metav1.ListOptions{
LabelSelector: c.options.HubbleRelayLabelSelector,
})
if err != nil {
@@ -548,7 +560,7 @@ func (c *Collector) Run() error {
Description: "Collecting logs from Cilium operator pods",
Quick: false,
Task: func(ctx context.Context) error {
p, err := c.client.ListPods(ctx, c.options.CiliumOperatorNamespace, metav1.ListOptions{
p, err := c.client.ListPods(ctx, c.options.CiliumNamespace, metav1.ListOptions{
LabelSelector: c.options.CiliumOperatorLabelSelector,
})
if err != nil {
@@ -560,12 +572,29 @@ func (c *Collector) Run() error {
return nil
},
},
{
CreatesSubtasks: true,
Description: "Collecting logs from 'clustermesh-apiserver' pods",
Quick: false,
Task: func(ctx context.Context) error {
p, err := c.client.ListPods(ctx, c.options.CiliumNamespace, metav1.ListOptions{
LabelSelector: c.options.ClustermeshApiserverLabelSelector,
})
if err != nil {
return fmt.Errorf("failed to get logs from 'clustermesh-apiserver' pods")
}
if err := c.submitLogsTasks(ctx, filterPods(p, nodeList), c.options.LogsSinceTime, c.options.LogsLimitBytes, absoluteTempPath); err != nil {
return fmt.Errorf("failed to collect logs from 'clustermesh-apiserver' pods")
}
return nil
},
},
{
CreatesSubtasks: true,
Description: "Collecting logs from Hubble pods",
Quick: false,
Task: func(ctx context.Context) error {
p, err := c.client.ListPods(ctx, c.options.HubbleNamespace, metav1.ListOptions{
p, err := c.client.ListPods(ctx, c.options.CiliumNamespace, metav1.ListOptions{
LabelSelector: c.options.HubbleLabelSelector,
})
if err != nil {
@@ -582,7 +611,7 @@ func (c *Collector) Run() error {
Description: "Collecting logs from Hubble Relay pods",
Quick: false,
Task: func(ctx context.Context) error {
p, err := c.client.ListPods(ctx, c.options.HubbleRelayNamespace, metav1.ListOptions{
p, err := c.client.ListPods(ctx, c.options.CiliumNamespace, metav1.ListOptions{
LabelSelector: c.options.HubbleRelayLabelSelector,
})
if err != nil {
@@ -599,8 +628,8 @@ func (c *Collector) Run() error {
Description: "Collecting logs from Hubble UI pods",
Quick: false,
Task: func(ctx context.Context) error {
p, err := c.client.ListPods(ctx, c.options.HubbleNamespace, metav1.ListOptions{
LabelSelector: c.options.HubbleLabelSelector,
p, err := c.client.ListPods(ctx, c.options.CiliumNamespace, metav1.ListOptions{
LabelSelector: c.options.HubbleUILabelSelector,
})
if err != nil {
return fmt.Errorf("failed to get logs from Hubble UI pods")