Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix container_oom_events_total always returns 0. #3278

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions cmd/cadvisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"runtime"
"strings"
"syscall"
"time"

cadvisorhttp "github.com/google/cadvisor/cmd/internal/http"
"github.com/google/cadvisor/container"
Expand Down Expand Up @@ -75,6 +76,8 @@ var perfEvents = flag.String("perf_events_config", "", "Path to a JSON file cont

var resctrlInterval = flag.Duration("resctrl_interval", 0, "Resctrl mon groups updating interval. Zero value disables updating mon groups.")

var oomEventRetainDuration = flag.Duration("oom_event_retain_time", 5*time.Minute, "Duration for which to retain OOM events. Zero value disables retaining OOM events.")

var (
// Metrics to be ignored.
// Tcp metrics are ignored by default.
Expand Down Expand Up @@ -132,7 +135,17 @@ func main() {

collectorHTTPClient := createCollectorHTTPClient(*collectorCert, *collectorKey)

resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHTTPClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval)
containerLabelFunc := metrics.DefaultContainerLabels
if !*storeContainerLabels {
whitelistedLabels := strings.Split(*whitelistedContainerLabels, ",")
// Trim spacing in labels
for i := range whitelistedLabels {
whitelistedLabels[i] = strings.TrimSpace(whitelistedLabels[i])
}
containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels)
}

resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHTTPClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval, containerLabelFunc, oomEventRetainDuration)
if err != nil {
klog.Fatalf("Failed to create a manager: %s", err)
}
Expand All @@ -152,16 +165,6 @@ func main() {
klog.Fatalf("Failed to register HTTP handlers: %v", err)
}

containerLabelFunc := metrics.DefaultContainerLabels
if !*storeContainerLabels {
whitelistedLabels := strings.Split(*whitelistedContainerLabels, ",")
// Trim spacing in labels
for i := range whitelistedLabels {
whitelistedLabels[i] = strings.TrimSpace(whitelistedLabels[i])
}
containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels)
}

// Register Prometheus collector to gather information about containers, Go runtime, processes, and machine
cadvisorhttp.RegisterPrometheusHandler(mux, resourceManager, *prometheusEndpoint, containerLabelFunc, includedMetrics)

Expand Down
2 changes: 0 additions & 2 deletions info/v1/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -967,8 +967,6 @@ type ContainerStats struct {
Resctrl ResctrlStats `json:"resctrl,omitempty"`

CpuSet CPUSetStats `json:"cpuset,omitempty"`

OOMEvents uint64 `json:"oom_events,omitempty"`
}

func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {
Expand Down
4 changes: 0 additions & 4 deletions manager/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/google/cadvisor/cache/memory"
Expand Down Expand Up @@ -65,7 +64,6 @@ type containerInfo struct {
}

type containerData struct {
oomEvents uint64
handler container.ContainerHandler
info containerInfo
memoryCache *memory.InMemoryCache
Expand Down Expand Up @@ -669,8 +667,6 @@ func (cd *containerData) updateStats() error {
}
}

stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents)

var customStatsErr error
cm := cd.collectorManager.(*collector.GenericCollectorManager)
if len(cm.Collectors) > 0 {
Expand Down
85 changes: 77 additions & 8 deletions manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
info "github.com/google/cadvisor/info/v1"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/machine"
"github.com/google/cadvisor/metrics"
"github.com/google/cadvisor/nvm"
"github.com/google/cadvisor/perf"
"github.com/google/cadvisor/resctrl"
Expand Down Expand Up @@ -144,6 +145,8 @@ type Manager interface {
AllPodmanContainers(c *info.ContainerInfoRequest) (map[string]info.ContainerInfo, error)

PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error)

GetOOMInfos() map[string]*oomparser.ContainerOomInfo
}

// Housekeeping configuration for the manager
Expand All @@ -153,7 +156,9 @@ type HousekeepingConfig = struct {
}

// New takes a memory storage and returns a new manager.
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfig HousekeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string, resctrlInterval time.Duration) (Manager, error) {
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfig HousekeepingConfig, includedMetricsSet container.MetricSet,
collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string,
perfEventsFile string, resctrlInterval time.Duration, f metrics.ContainerLabelsFunc, oomRetainDuration *time.Duration) (Manager, error) {
if memoryCache == nil {
return nil, fmt.Errorf("manager requires memory storage")
}
Expand Down Expand Up @@ -208,6 +213,9 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, HousekeepingConfi
collectorHTTPClient: collectorHTTPClient,
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
containerEnvMetadataWhiteList: containerEnvMetadataWhiteList,
oomInfos: map[string]*oomparser.ContainerOomInfo{},
containerLabelFunc: f,
oomRetainDuration: oomRetainDuration,
}

machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
Expand Down Expand Up @@ -247,6 +255,7 @@ type namespacedContainerName struct {
}

type manager struct {
oomInfos map[string]*oomparser.ContainerOomInfo
containers map[namespacedContainerName]*containerData
containersLock sync.RWMutex
memoryCache *memory.InMemoryCache
Expand All @@ -271,6 +280,8 @@ type manager struct {
rawContainerCgroupPathPrefixWhiteList []string
// List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels.
containerEnvMetadataWhiteList []string
containerLabelFunc metrics.ContainerLabelsFunc
oomRetainDuration *time.Duration
}

func (m *manager) PodmanContainer(containerName string, query *info.ContainerInfoRequest) (info.ContainerInfo, error) {
Expand Down Expand Up @@ -318,7 +329,7 @@ func (m *manager) Start() error {
return err
}
klog.V(2).Infof("Starting recovery of all containers")
err = m.detectSubcontainers("/")
err = m.detectSubContainers("/")
if err != nil {
return err
}
Expand All @@ -340,6 +351,7 @@ func (m *manager) Start() error {
quitUpdateMachineInfo := make(chan error)
m.quitChannels = append(m.quitChannels, quitUpdateMachineInfo)
go m.updateMachineInfo(quitUpdateMachineInfo)
go m.cleanUpOomInfos()

return nil
}
Expand All @@ -363,6 +375,61 @@ func (m *manager) Stop() error {
return nil
}

func (m *manager) GetOOMInfos() map[string]*oomparser.ContainerOomInfo {
m.containersLock.RLock()
defer m.containersLock.RUnlock()
oomInfos := make(map[string]*oomparser.ContainerOomInfo)
for k, v := range m.oomInfos {
if time.Since(v.TimeOfDeath) > *m.oomRetainDuration {
continue
}
oomInfos[k] = v
}
return oomInfos
}

func (m *manager) cleanUpOomInfos() {
ticker := time.NewTicker(time.Minute)
defer ticker.Stop()

for {
select {
case <-ticker.C:
m.containersLock.Lock()
for k, v := range m.oomInfos {
if time.Since(v.TimeOfDeath) > *m.oomRetainDuration {
delete(m.oomInfos, k)
}
}
m.containersLock.Unlock()
}
}
}

func (m *manager) addOrUpdateOomInfo(cont *containerData, timeOfDeath time.Time) error {
m.containersLock.Lock()
defer m.containersLock.Unlock()

contInfo, err := m.containerDataToContainerInfo(cont, &info.ContainerInfoRequest{
NumStats: 60,
})
if err != nil {
return err
}
if oomInfo, ok := m.oomInfos[contInfo.Id]; ok {
atomic.AddUint64(&oomInfo.OomEvents, 1)
return nil
}
containerLabels := m.containerLabelFunc(contInfo)
newOomInfo := &oomparser.ContainerOomInfo{
MetricLabels: containerLabels,
TimeOfDeath: timeOfDeath,
}
atomic.AddUint64(&newOomInfo.OomEvents, 1)
m.oomInfos[contInfo.Id] = newOomInfo
return nil
}

func (m *manager) destroyCollectors() {
for _, container := range m.containers {
container.perfCollector.Destroy()
Expand Down Expand Up @@ -406,7 +473,7 @@ func (m *manager) globalHousekeeping(quit chan error) {
start := time.Now()

// Check for new containers.
err := m.detectSubcontainers("/")
err := m.detectSubContainers("/")
if err != nil {
klog.Errorf("Failed to detect containers: %s", err)
}
Expand Down Expand Up @@ -1056,7 +1123,7 @@ func (m *manager) destroyContainerLocked(containerName string) error {

// Detect all containers that have been added or deleted from the specified container.
func (m *manager) getContainersDiff(containerName string) (added []info.ContainerReference, removed []info.ContainerReference, err error) {
// Get all subcontainers recursively.
// Get all subContainers recursively.
m.containersLock.RLock()
cont, ok := m.containers[namespacedContainerName{
Name: containerName,
Expand Down Expand Up @@ -1103,8 +1170,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
return
}

// Detect the existing subcontainers and reflect the setup here.
func (m *manager) detectSubcontainers(containerName string) error {
// Detect the existing subContainers and reflect the setup here.
func (m *manager) detectSubContainers(containerName string) error {
added, removed, err := m.getContainersDiff(containerName)
if err != nil {
return err
Expand Down Expand Up @@ -1147,7 +1214,7 @@ func (m *manager) watchForNewContainers(quit chan error) error {
}

// There is a race between starting the watch and new container creation so we do a detection before we read new containers.
err := m.detectSubcontainers("/")
err := m.detectSubContainers("/")
if err != nil {
return err
}
Expand Down Expand Up @@ -1247,7 +1314,9 @@ func (m *manager) watchForNewOoms() error {
continue
}
for _, cont := range conts {
atomic.AddUint64(&cont.oomEvents, 1)
if err := m.addOrUpdateOomInfo(cont, oomInstance.TimeOfDeath); err != nil {
klog.Errorf("failed to add OOM info for %q: %v", oomInstance.ContainerName, err)
}
}
}
}()
Expand Down
50 changes: 49 additions & 1 deletion manager/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ import (
info "github.com/google/cadvisor/info/v1"
itest "github.com/google/cadvisor/info/v1/test"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/metrics"
"github.com/google/cadvisor/utils/oomparser"
"github.com/google/cadvisor/utils/sysfs/fakesysfs"

"github.com/stretchr/testify/assert"
clock "k8s.io/utils/clock/testing"

Expand Down Expand Up @@ -531,3 +532,50 @@ func TestDockerContainersInfo(t *testing.T) {
t.Errorf("expected error %q but received %q", expectedError, err)
}
}

func Test_addOrUpdateOomInfo(t *testing.T) {
m := manager{
oomInfos: map[string]*oomparser.ContainerOomInfo{},
memoryCache: memory.New(2*time.Minute, nil),
containerLabelFunc: metrics.DefaultContainerLabels,
}
contReference := info.ContainerReference{
Name: "testcontainer",
Id: "testcontainer1",
}
contInfo := &info.ContainerInfo{
ContainerReference: contReference,
}
spec := info.ContainerSpec{}
mockHandler := containertest.NewMockContainerHandler("testcontainer")
mockHandler.On("GetSpec").Return(
spec,
nil,
).Times(2)
mockHandler.On("ListContainers", container.ListSelf).Return(
[]info.ContainerReference{contReference},
nil,
).Times(2)
collectorManager, _ := collector.NewCollectorManager()
cont := &containerData{
info: containerInfo{
ContainerReference: contReference,
},
handler: mockHandler,
collectorManager: collectorManager,
infoLastUpdatedTime: time.Now(),
clock: clock.NewFakeClock(time.Now()),
}
m.memoryCache.AddStats(contInfo, &info.ContainerStats{
Timestamp: time.Now(),
})

err := m.addOrUpdateOomInfo(cont, time.Now())
assert.NoError(t, err)
assert.Equal(t, 1, len(m.oomInfos))
assert.Equal(t, uint64(1), m.oomInfos["testcontainer1"].OomEvents)

// update oom info, increase oom event counts
assert.NoError(t, m.addOrUpdateOomInfo(cont, time.Now()))
assert.Equal(t, uint64(2), m.oomInfos["testcontainer1"].OomEvents)
}
3 changes: 3 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (

info "github.com/google/cadvisor/info/v1"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/utils/oomparser"
)

// metricValue describes a single metric value for a given set of label values
Expand All @@ -39,4 +40,6 @@ type infoProvider interface {
GetVersionInfo() (*info.VersionInfo, error)
// GetMachineInfo provides information about the machine.
GetMachineInfo() (*info.MachineInfo, error)
// GetOOMInfos provides information about OOMs.
GetOOMInfos() map[string]*oomparser.ContainerOomInfo
}
Loading