Skip to content

Commit

Permalink
fix container_oom_events_total always returns 0.
Browse files Browse the repository at this point in the history
In a Kubernetes pod, if a container is OOM-killed, it will be deleted and a new container will be created. Therefore, the `container_oom_events_total` metric will always be 0. It would be useful to preserve the metrics of the OOM-killed containers instead of deleting them.

Signed-off-by: joey <zchengjoey@gmail.com>
  • Loading branch information
chengjoey committed Mar 22, 2023
1 parent b4c955f commit dcbab71
Showing 1 changed file with 12 additions and 8 deletions.
20 changes: 12 additions & 8 deletions manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ func (m *manager) Start() error {
return err
}
klog.V(2).Infof("Starting recovery of all containers")
err = m.detectSubcontainers("/")
err = m.detectSubContainers("/")
if err != nil {
return err
}
Expand Down Expand Up @@ -385,7 +385,7 @@ func (m *manager) globalHousekeeping(quit chan error) {
start := time.Now()

// Check for new containers.
err := m.detectSubcontainers("/")
err := m.detectSubContainers("/")
if err != nil {
klog.Errorf("Failed to detect containers: %s", err)
}
Expand Down Expand Up @@ -1009,6 +1009,10 @@ func (m *manager) destroyContainerLocked(containerName string) error {
// Already destroyed, done.
return nil
}
// If the container is OOM-killed, keep monitoring metrics for the container.
if cont.oomEvents != 0 {
return nil
}

// Tell the container to stop.
err := cont.Stop()
Expand Down Expand Up @@ -1045,7 +1049,7 @@ func (m *manager) destroyContainerLocked(containerName string) error {

// Detect all containers that have been added or deleted from the specified container.
func (m *manager) getContainersDiff(containerName string) (added []info.ContainerReference, removed []info.ContainerReference, err error) {
// Get all subcontainers recursively.
// Get all subContainers recursively.
m.containersLock.RLock()
cont, ok := m.containers[namespacedContainerName{
Name: containerName,
Expand All @@ -1067,8 +1071,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
// Determine which were added and which were removed.
allContainersSet := make(map[string]*containerData)
for name, d := range m.containers {
// Only add the canonical name.
if d.info.Name == name.Name {
// Only add the canonical name. Preserve containers that were deleted due to OOMKilled.
if d.info.Name == name.Name && d.oomEvents != 0 {
allContainersSet[name.Name] = d
}
}
Expand All @@ -1092,8 +1096,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
return
}

// Detect the existing subcontainers and reflect the setup here.
func (m *manager) detectSubcontainers(containerName string) error {
// Detect the existing subContainers and reflect the setup here.
func (m *manager) detectSubContainers(containerName string) error {
added, removed, err := m.getContainersDiff(containerName)
if err != nil {
return err
Expand Down Expand Up @@ -1136,7 +1140,7 @@ func (m *manager) watchForNewContainers(quit chan error) error {
}

// There is a race between starting the watch and new container creation so we do a detection before we read new containers.
err := m.detectSubcontainers("/")
err := m.detectSubContainers("/")
if err != nil {
return err
}
Expand Down

0 comments on commit dcbab71

Please sign in to comment.