Commit

feat: Add check K8S011 for version skew between kube-proxy and `kubelet` (#39)

bryantbiggs committed Mar 11, 2023
1 parent 6e94e1a commit 7980b93
Showing 12 changed files with 275 additions and 113 deletions.
20 changes: 19 additions & 1 deletion docs/process/checks.md
@@ -209,7 +209,9 @@ Either `.spec.affinity.podAntiAffinity` or `.spec.topologySpreadConstraints` is

#### K8S006

A `readinessProbe` must be set to ensure traffic is not sent to pods before they are ready following their re-deployment from a node replacement.
**❌ Remediation required**

A `readinessProbe` must be set to ensure traffic is not routed to pods before they are ready following their re-deployment from a node replacement.

#### K8S007

@@ -272,3 +274,19 @@ For clusters on Kubernetes <`v1.21`
[Amazon EBS CSI migration frequently asked questions](https://docs.aws.amazon.com/eks/latest/userguide/ebs-csi-migration-faq.html)

[Kubernetes In-Tree to CSI Volume Migration Status Update](https://kubernetes.io/blog/2021/12/10/storage-in-tree-to-csi-migration-status-update/)

#### K8S011

**❌ Remediation required**

`kube-proxy` on an Amazon EKS cluster is subject to the same [compatibility and skew policy as Kubernetes](https://kubernetes.io/releases/version-skew-policy/#kube-proxy):

- It must be the same minor version as `kubelet` on your Amazon EC2 nodes
- It cannot be newer than the minor version of your cluster's control plane
- Its version on your Amazon EC2 nodes cannot be more than two minor versions older than your control plane. For example, if your control plane is running Kubernetes `1.25`, then the `kube-proxy` minor version cannot be older than `1.23`
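
Taken together, the policy reduces to a comparison of minor versions. The sketch below illustrates the rule only; the function name and integer-based signature are hypothetical, not the API this commit adds to `eksup` (the actual check, shown in the diff further down, derives these versions from the node and `kube-proxy` DaemonSet objects):

```rust
/// Hypothetical helper (illustration only): minor versions are passed as
/// integers, e.g. 25 for Kubernetes `v1.25`.
fn kube_proxy_skew_ok(control_plane: u32, kubelet: u32, kube_proxy: u32) -> bool {
    kube_proxy == kubelet                  // same minor version as `kubelet`
        && kube_proxy <= control_plane     // never newer than the control plane
        && control_plane - kube_proxy <= 2 // at most two minor versions older
}

fn main() {
    // Control plane v1.25 with kubelet and kube-proxy both at v1.25: compliant.
    assert!(kube_proxy_skew_ok(25, 25, 25));
    // Control plane v1.25, kubelet v1.25, kube-proxy v1.23: flagged, because
    // kube-proxy must match kubelet even though it is within two minor
    // versions of the control plane.
    assert!(!kube_proxy_skew_ok(25, 25, 23));
}
```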

If you recently updated your cluster to a new Kubernetes minor version, update your Amazon EC2 nodes (i.e., `kubelet`) to the same minor version before updating `kube-proxy` to match. The order of operations during an upgrade is as follows:

1. Update the control plane to the new Kubernetes minor version
2. Update the nodes, which updates `kubelet`, to the new Kubernetes minor version
3. Update `kube-proxy` to the new Kubernetes minor version
8 changes: 4 additions & 4 deletions eksup/src/analysis.rs
@@ -31,14 +31,15 @@ impl Results {
output.push_str(&self.data_plane.eks_managed_nodegroup_update.to_stdout_table()?);
output.push_str(&self.data_plane.self_managed_nodegroup_update.to_stdout_table()?);

output.push_str(&self.data_plane.version_skew.to_stdout_table()?);
output.push_str(&self.kubernetes.version_skew.to_stdout_table()?);
output.push_str(&self.kubernetes.min_replicas.to_stdout_table()?);
output.push_str(&self.kubernetes.min_ready_seconds.to_stdout_table()?);
output.push_str(&self.kubernetes.pod_topology_distribution.to_stdout_table()?);
output.push_str(&self.kubernetes.readiness_probe.to_stdout_table()?);
output.push_str(&self.kubernetes.termination_grace_period.to_stdout_table()?);
output.push_str(&self.kubernetes.docker_socket.to_stdout_table()?);
output.push_str(&self.kubernetes.pod_security_policy.to_stdout_table()?);
output.push_str(&self.kubernetes.kube_proxy_version_skew.to_stdout_table()?);

Ok(output)
}
@@ -59,9 +60,8 @@ pub(crate) async fn analyze(aws_shared_config: &aws_config::SdkConfig, cluster:
let cluster_findings = eks::get_cluster_findings(cluster).await?;
let subnet_findings = eks::get_subnet_findings(&ec2_client, &k8s_client, cluster).await?;
let addon_findings = eks::get_addon_findings(&eks_client, cluster_name, cluster_version).await?;
let dataplane_findings =
eks::get_data_plane_findings(&asg_client, &ec2_client, &eks_client, &k8s_client, cluster).await?;
let kubernetes_findings = k8s::get_kubernetes_findings(&k8s_client, &target_version).await?;
let dataplane_findings = eks::get_data_plane_findings(&asg_client, &ec2_client, &eks_client, cluster).await?;
let kubernetes_findings = k8s::get_kubernetes_findings(&k8s_client, cluster_version, &target_version).await?;

Ok(Results {
cluster: cluster_findings,
14 changes: 1 addition & 13 deletions eksup/src/eks/findings.rs
@@ -5,10 +5,7 @@ use aws_sdk_eks::{model::Cluster, Client as EksClient};
use kube::Client as K8sClient;
use serde::{Deserialize, Serialize};

use crate::{
eks::{checks, resources},
k8s,
};
use crate::eks::{checks, resources};

/// Findings related to the cluster itself, primarily the control plane
#[derive(Debug, Serialize, Deserialize)]
@@ -96,11 +93,6 @@ pub async fn get_addon_findings(
/// (pods, deployments, etc.)
#[derive(Debug, Serialize, Deserialize)]
pub struct DataPlaneFindings {
/// The skew/diff between the cluster control plane (API Server) and the nodes in the data plane (kubelet)
/// It is recommended that these versions are aligned prior to upgrading, and changes are required when
/// the skew policy could be violated post upgrade (i.e. if current skew is +2, the policy would be violated
/// as soon as the control plane is upgraded, resulting in +3, and therefore changes are required before upgrade)
pub version_skew: Vec<k8s::VersionSkew>,
/// The health of the EKS managed node groups as reported by the Amazon EKS managed node group API
pub eks_managed_nodegroup_health: Vec<checks::NodegroupHealthIssue>,
/// Will show if the current launch template provided to the Amazon EKS managed node group is NOT the latest
@@ -123,17 +115,14 @@ pub async fn get_data_plane_findings(
asg_client: &AsgClient,
ec2_client: &Ec2Client,
eks_client: &EksClient,
k8s_client: &kube::Client,
cluster: &Cluster,
) -> Result<DataPlaneFindings> {
let cluster_name = cluster.name().unwrap_or_default();
let cluster_version = cluster.version().unwrap_or_default();

let eks_mngs = resources::get_eks_managed_nodegroups(eks_client, cluster_name).await?;
let self_mngs = resources::get_self_managed_nodegroups(asg_client, cluster_name).await?;
let fargate_profiles = resources::get_fargate_profiles(eks_client, cluster_name).await?;

let version_skew = k8s::version_skew(k8s_client, cluster_version).await?;
let eks_managed_nodegroup_health = checks::eks_managed_nodegroup_health(&eks_mngs).await?;
let mut eks_managed_nodegroup_update = Vec::new();
for eks_mng in &eks_mngs {
@@ -148,7 +137,6 @@ pub async fn get_data_plane_findings(
}

Ok(DataPlaneFindings {
version_skew,
eks_managed_nodegroup_health,
eks_managed_nodegroup_update: eks_managed_nodegroup_update
.into_iter()
8 changes: 8 additions & 0 deletions eksup/src/finding.rs
@@ -145,6 +145,12 @@ pub enum Code {

/// Pod security policies present
K8S009,

/// EBS CSI driver not installed (v1.23+)
K8S010,

/// Kubernetes version skew detected between kube-proxy and kubelet
K8S011,
}

impl std::fmt::Display for Code {
@@ -171,6 +177,8 @@ impl std::fmt::Display for Code {
Code::K8S007 => write!(f, "K8S007"),
Code::K8S008 => write!(f, "K8S008"),
Code::K8S009 => write!(f, "K8S009"),
Code::K8S010 => write!(f, "K8S010"),
Code::K8S011 => write!(f, "K8S011"),
}
}
}
138 changes: 112 additions & 26 deletions eksup/src/k8s/checks.rs
@@ -1,14 +1,12 @@
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};

use anyhow::Result;
use k8s_openapi::api::core;
use kube::{api::Api, Client};
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use tabled::{locator::ByColumnName, Disable, Margin, Style, Table, Tabled};

use crate::{
finding::{self, Findings},
k8s::resources::Resource,
k8s::resources::{self, Resource},
version,
};

@@ -99,20 +97,12 @@ impl Findings for Vec<VersionSkew> {
}

/// Returns all of the nodes in the cluster
pub async fn version_skew(client: &Client, cluster_version: &str) -> Result<Vec<VersionSkew>> {
let api: Api<core::v1::Node> = Api::all(client.to_owned());
let node_list = api.list(&Default::default()).await?;

pub async fn version_skew(nodes: &[resources::Node], cluster_version: &str) -> Result<Vec<VersionSkew>> {
let mut findings = vec![];

for node in &node_list {
let status = node.status.as_ref().unwrap();
let node_info = status.node_info.as_ref().unwrap();
let kubelet_version = node_info.kubelet_version.to_owned();

let node_minor_version = version::parse_minor(&kubelet_version).unwrap();
for node in nodes {
let control_plane_minor_version = version::parse_minor(cluster_version)?;
let version_skew = control_plane_minor_version - node_minor_version;
let version_skew = control_plane_minor_version - node.minor_version;
if version_skew == 0 {
continue;
}
@@ -125,20 +115,18 @@ pub async fn version_skew(client: &Client, cluster_version: &str) -> Result<Vec<
_ => finding::Remediation::Required,
};

if let Some(labels) = &node.metadata.labels {
if let Some(labels) = &node.labels {
if labels.contains_key("eks.amazonaws.com/nodegroup") {
// Nodes created by EKS managed nodegroups are required to match control plane
// before the control plane will permit an upgrade
remediation = finding::Remediation::Required;
}
}

if let Some(name) = &node.metadata.name {
if name.starts_with("fargate-") {
// Nodes created by EKS Fargate are required to match control plane
// before the control plane will permit an upgrade
remediation = finding::Remediation::Required;
}
if node.name.starts_with("fargate-") {
// Nodes created by EKS Fargate are required to match control plane
// before the control plane will permit an upgrade
remediation = finding::Remediation::Required;
}

let finding = finding::Finding {
@@ -149,9 +137,9 @@ pub async fn version_skew(client: &Client, cluster_version: &str) -> Result<Vec<

let node = VersionSkew {
finding,
name: node.metadata.name.as_ref().unwrap().to_owned(),
kubelet_version: kubelet_version.to_owned(),
kubernetes_version: format!("v{}", version::normalize(&kubelet_version).unwrap()),
name: node.name.to_owned(),
kubelet_version: node.kubelet_version.to_owned(),
kubernetes_version: format!("v{}", version::normalize(&node.kubelet_version).unwrap()),
control_plane_version: format!("v{cluster_version}"),
version_skew: format!("+{version_skew}"),
};
@@ -456,6 +444,104 @@ impl Findings for Vec<PodSecurityPolicy> {
}
}
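
// Editorial aside, not part of this commit: `resources::Node`, consumed by
// `version_skew` above and `kube_proxy_version_skew` below, is defined in
// eksup/src/k8s/resources.rs (one of the 12 changed files, not shown in this
// excerpt). From its usage it presumably exposes at least `name: String`,
// `labels: Option<_>` keyed by label name, `kubelet_version: String`, and a
// numeric `minor_version` pre-parsed from the kubelet version.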

#[derive(Clone, Debug, Serialize, Deserialize, Tabled)]
#[tabled(rename_all = "UpperCase")]
pub struct KubeProxyVersionSkew {
#[tabled(inline)]
pub finding: finding::Finding,
#[tabled(rename = "KUBELET")]
pub kubelet_version: String,
#[tabled(rename = "KUBE PROXY")]
pub kube_proxy_version: String,
#[tabled(rename = "SKEW")]
pub version_skew: String,
}

pub async fn kube_proxy_version_skew(
nodes: &[resources::Node],
resources: &[resources::StdResource],
) -> Result<Vec<KubeProxyVersionSkew>> {
let kube_proxy = match resources
.iter()
.filter(|r| r.metadata.kind == resources::Kind::DaemonSet && r.metadata.name == "kube-proxy")
.collect::<Vec<_>>()
.get(0)
{
Some(k) => k.to_owned(),
None => {
println!("Unable to find kube-proxy");
return Ok(vec![]);
}
};

let ptmpl = kube_proxy.spec.template.as_ref().unwrap();
let pspec = ptmpl.spec.as_ref().unwrap();
let kproxy_minor_version = pspec
.containers
.iter()
.map(|container| {
// TODO - this seems brittle
let image = container.image.as_ref().unwrap().split(':').collect::<Vec<_>>()[1];
version::parse_minor(image).unwrap()
})
.next()
.context("Unable to find image version for kube-proxy")?;

let findings = nodes
.iter()
.map(|node| node.minor_version)
.collect::<HashSet<_>>()
.into_iter()
.filter(|node_ver| node_ver != &kproxy_minor_version)
.map(|node_ver| {
let remediation = finding::Remediation::Required;
let finding = finding::Finding {
code: finding::Code::K8S011,
symbol: remediation.symbol(),
remediation,
};

KubeProxyVersionSkew {
finding,
kubelet_version: format!("v1.{node_ver}"),
kube_proxy_version: format!("v1.{kproxy_minor_version}"),
version_skew: format!("{}", kproxy_minor_version - node_ver),
}
})
.collect();

Ok(findings)
}
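
// Editorial aside, not part of this commit: the tag parsing above assumes the
// kube-proxy container image is always referenced with a version tag, e.g.
// `<account>.dkr.ecr.<region>.amazonaws.com/eks/kube-proxy:v1.24.7-eksbuild.2`
// (a hypothetical-but-typical EKS reference), so that `split(':')` index 1
// yields "v1.24.7-eksbuild.2" for `version::parse_minor` to reduce to 24. This
// is why the TODO above calls it brittle: an untagged image would panic on the
// `[1]` index, and a digest-pinned one (`...@sha256:<digest>`) would fail the
// parse and its `unwrap`.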

impl Findings for Vec<KubeProxyVersionSkew> {
fn to_markdown_table(&self, leading_whitespace: &str) -> Result<String> {
if self.is_empty() {
return Ok(format!(
"{leading_whitespace}✅ - `kube-proxy` version is aligned with the node/`kubelet` versions in use"
));
}

let mut table = Table::new(self);
table
.with(Disable::column(ByColumnName::new("CHECK")))
.with(Margin::new(1, 0, 0, 0).set_fill('\t', 'x', 'x', 'x'))
.with(Style::markdown());

Ok(format!("{table}\n"))
}

fn to_stdout_table(&self) -> Result<String> {
if self.is_empty() {
return Ok("".to_owned());
}

let mut table = Table::new(self);
table.with(Style::sharp());

Ok(format!("{table}\n"))
}
}

pub trait K8sFindings {
fn get_resource(&self) -> Resource;

21 changes: 18 additions & 3 deletions eksup/src/k8s/findings.rs
@@ -9,18 +9,30 @@ use crate::k8s::{

#[derive(Debug, Serialize, Deserialize)]
pub struct KubernetesFindings {
/// The skew/diff between the cluster control plane (API Server) and the nodes in the data plane (kubelet)
/// It is recommended that these versions are aligned prior to upgrading, and changes are required when
/// the skew policy could be violated post upgrade (i.e. if current skew is +2, the policy would be violated
/// as soon as the control plane is upgraded, resulting in +3, and therefore changes are required before upgrade)
pub version_skew: Vec<checks::VersionSkew>,
pub min_replicas: Vec<checks::MinReplicas>,
pub min_ready_seconds: Vec<checks::MinReadySeconds>,
pub readiness_probe: Vec<checks::Probe>,
pub pod_topology_distribution: Vec<checks::PodTopologyDistribution>,
pub termination_grace_period: Vec<checks::TerminationGracePeriod>,
pub docker_socket: Vec<checks::DockerSocket>,
pub pod_security_policy: Vec<checks::PodSecurityPolicy>,
pub kube_proxy_version_skew: Vec<checks::KubeProxyVersionSkew>,
}

pub async fn get_kubernetes_findings(k8s_client: &K8sClient, target_version: &str) -> Result<KubernetesFindings> {
let resources = resources::get_resources(k8s_client).await?;
pub async fn get_kubernetes_findings(
client: &K8sClient,
cluster_version: &str,
target_version: &str,
) -> Result<KubernetesFindings> {
let resources = resources::get_resources(client).await?;
let nodes = resources::get_nodes(client).await?;

let version_skew = checks::version_skew(&nodes, cluster_version).await?;
let min_replicas: Vec<checks::MinReplicas> = resources.iter().filter_map(|s| s.min_replicas()).collect();
let min_ready_seconds: Vec<checks::MinReadySeconds> =
resources.iter().filter_map(|s| s.min_ready_seconds()).collect();
@@ -33,15 +45,18 @@ pub async fn get_kubernetes_findings(k8s_client: &K8sClient, target_version: &st
.iter()
.filter_map(|s| s.docker_socket(target_version))
.collect();
let pod_security_policy = resources::get_podsecuritypolicies(k8s_client, target_version).await?;
let pod_security_policy = resources::get_podsecuritypolicies(client, target_version).await?;
let kube_proxy_version_skew = checks::kube_proxy_version_skew(&nodes, &resources).await?;

Ok(KubernetesFindings {
version_skew,
min_replicas,
min_ready_seconds,
readiness_probe,
pod_topology_distribution,
termination_grace_period,
docker_socket,
pod_security_policy,
kube_proxy_version_skew,
})
}
1 change: 0 additions & 1 deletion eksup/src/k8s/mod.rs
@@ -2,6 +2,5 @@ mod checks;
mod findings;
mod resources;

pub use checks::{version_skew, VersionSkew};
pub use findings::{get_kubernetes_findings, KubernetesFindings};
pub use resources::get_eniconfigs;