Conversation
@@ -31,6 +31,8 @@ const configFilePathArgName = "config" | |||
|
|||
// ContainerdConfig contains config related to containerd | |||
type ContainerdConfig struct { | |||
// ContainerdRootDir is the root directory path for containerd. | |||
ContainerdRootDir string |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
toml
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will do in my PR. Thanks!
pkg/server/container_stats.go
Outdated
"k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime" | ||
) | ||
|
||
// ContainerStats returns stats of the container. If the container does not | ||
// exist, the call returns an error. | ||
func (c *criContainerdService) ContainerStats(ctx context.Context, in *runtime.ContainerStatsRequest) (*runtime.ContainerStatsResponse, error) { | ||
return nil, errors.New("not implemented") | ||
// Validate the stats request | ||
if in == nil || in.ContainerId == "" { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: in.GetContainerId() == ""
Not important, just a nit.
pkg/server/container_stats_list.go
Outdated
cs.Cpu = &runtime.CpuUsage{Timestamp: stats.Timestamp.Unix(), UsageCoreNanoSeconds: &runtime.UInt64Value{Value: metrics.CPU.Usage.Total}} | ||
cs.Memory = &runtime.MemoryUsage{Timestamp: stats.Timestamp.Unix(), WorkingSetBytes: &runtime.UInt64Value{metrics.Memory.Usage.Usage}} | ||
|
||
cs.Attributes = &runtime.ContainerAttributes{Id: stats.ID} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please fill the attributes, all required information is in c.containerStore.Get(in.ContainerId)
, which you have called.
pkg/server/container_stats.go
Outdated
|
||
var cs runtime.ContainerStats | ||
if err := getContainerMetrics(resp.Metrics[0], &cs); err != nil { | ||
return nil, fmt.Errorf("failed to decode container metrics for container %q: %v", err) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unused %q. I'm fine with removing it directly.
pkg/server/container_stats_list.go
Outdated
metrics := s.(*cgroups.Metrics) | ||
cs.Cpu = &runtime.CpuUsage{Timestamp: stats.Timestamp.Unix(), UsageCoreNanoSeconds: &runtime.UInt64Value{Value: metrics.CPU.Usage.Total}} | ||
cs.Memory = &runtime.MemoryUsage{Timestamp: stats.Timestamp.Unix(), WorkingSetBytes: &runtime.UInt64Value{metrics.Memory.Usage.Usage}} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should also get writable layer size from snapshots. See the imagefs stats PR.
pkg/server/container_stats_list.go
Outdated
return fmt.Errorf("failed to extract container metrics: %v", err) | ||
} | ||
metrics := s.(*cgroups.Metrics) | ||
cs.Cpu = &runtime.CpuUsage{Timestamp: stats.Timestamp.Unix(), UsageCoreNanoSeconds: &runtime.UInt64Value{Value: metrics.CPU.Usage.Total}} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Seems too long and also the following one.
pkg/server/container_stats_list.go
Outdated
} | ||
func (c *criContainerdService) buildTaskMetricsRequest(r *runtime.ListContainerStatsRequest) (tasks.MetricsRequest, error) { | ||
var req tasks.MetricsRequest | ||
if r == nil || r.Filter == nil { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: r.GetFilter().GetId() and r.GetFilter().GetPodSandboxId()
These getters could help you avoid the nil check.
pkg/server/container_stats_list.go
Outdated
var req tasks.MetricsRequest | ||
if r == nil || r.Filter == nil { | ||
return req, nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe the relationship between different filters in CRI is AND
.
What about containerd?
- If different filters in containerd is
OR
, then it's not correct. - If different filters in containerd is
AND
, then we'll get no container, because there is no single container has all the ids.
As is mentioned in our discussion, I'm fine with list all containerstats and do filter afterwards, if the filter doesn't work that well :) Up to you to decide.
integration/container_stats_test.go
Outdated
) | ||
|
||
const ( | ||
defaultImage = "gcr.io/google_containers/pause:3.0" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This const is global, let's make the name more specific.
integration/container_stats_test.go
Outdated
s, err := runtimeService.ContainerStats(sb) | ||
t.Logf("Verify stats received for sandbox container") | ||
require.NoError(t, err) | ||
testStats(t, s) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does this work? We don't store sandbox container id in container store, right?
pkg/server/container_stats_list.go
Outdated
if err != nil { | ||
return nil, fmt.Errorf("failed to build metrics request: %v", err) | ||
} | ||
resp, err := c.taskService.Metrics(context.Background(), &request) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We may want to add a filter to filter containerKindLabel=container
. This will also include sandbox containers in the list.
I need to discuss with @yujuhong to finalize whether we should include sandbox container stats in ListContainerStats
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Discussed with @yujuhong, let's exclude the sandbox container for now. We need to figure out a way to handle pod level overhead.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok makes sense. I wasn't sure about it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Circling back again. If we exclude sandbox container. The aggregated stats for a pod might be misleading right ? Or is the reason that the stats for sandbox containers is negligible and consistent for all pods so it doesnt factor into decision making ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@abhinandanpb This is a problem in CRI now.
We have pod level cgroup, and ideally Kubelet should make resource management decision based on resource usage got from pod level cgroup. And the container metrics are only for user who cares about how much resource their workloads (application containers) are using.
However, today Kubelet is not using pod level stats yet. It is still making decision based on container metrics. This means that the problem you mentioned does exist, kubelet is not looking at all resource usage inside the pod (sandbox container, container-shim etc.).
This is an issue needs to be addressed. We should:
- Either change Kubelet to make resource management decision based on pod level stats, which include everything inside the pod cgroup.
- Or change CRI to include pod sandbox extra overhead.
1 is preferable now, and we'll probably do it next Quarter.
For now, we could only provide application container metrics based on current CRI. We'll address this in 1.9. We are going to refactor cadvisor, and pod cgroup metrics will be part of the effort.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it. Thanks for the explaination.
integration/container_stats_test.go
Outdated
} | ||
} | ||
|
||
func testStats(t *testing.T, s *runtime.ContainerStats) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will this be flaky? Is it possible that a container gets 0 cpu seconds? Do we need to use Eventually
to check this?
Just my question. I'm not sure about the answer. :) I think 0 cpu seconds shouldn't be possible, but you may have some more ideas. :)
integration/container_stats_test.go
Outdated
} | ||
|
||
func testStats(t *testing.T, s *runtime.ContainerStats) { | ||
require.NotEmpty(t, s.Attributes) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Use Getter
, which will save many lines. :)
Getter
already handles the case that the original object is nil, it will just return zero value.
pkg/server/container_stats.go
Outdated
"k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime" | ||
) | ||
|
||
// ContainerStats returns stats of the container. If the container does not | ||
// exist, the call returns an error. | ||
func (c *criContainerdService) ContainerStats(ctx context.Context, in *runtime.ContainerStatsRequest) (*runtime.ContainerStatsResponse, error) { | ||
return nil, errors.New("not implemented") | ||
// Validate the stats request | ||
if in == nil || in.GetContainerId() == "" { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
will get rid of this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: in.GetContainerId() == ""
is enough
@abhinandanpb Dependent PR is merged. Please rebase. |
83f1fe2
to
1a98921
Compare
pkg/server/container_stats.go
Outdated
|
||
var cs runtime.ContainerStats | ||
if err := c.getContainerMetrics(resp.Metrics[0], &cs); err != nil { | ||
return nil, fmt.Errorf("failed to decode container metrics %v", err) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Add :
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
pkg/server/container_stats_list.go
Outdated
var usedBytes, inodesUsed uint64 | ||
sn, err := c.snapshotStore.Get(stats.ID) | ||
// If snapshotstore doesnt have cached snapshotStore | ||
// set WritableLayer to zero |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: WritableLayer usage.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
pkg/server/container_stats_list.go
Outdated
|
||
var usedBytes, inodesUsed uint64 | ||
sn, err := c.snapshotStore.Get(stats.ID) | ||
// If snapshotstore doesnt have cached snapshotStore |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: s/doesnt/doesn't
nit: cached snapshot information.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
pkg/server/container_stats_list.go
Outdated
usedBytes = sn.Inodes | ||
} | ||
cs.WritableLayer = &runtime.FilesystemUsage{ | ||
Timestamp: stats.Timestamp.Unix(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should use the timestamp in the snapshot information.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
pkg/server/container_stats_list.go
Outdated
// the information in the stats request and the containerStore | ||
func (c *criContainerdService) buildTaskMetricsRequest(r *runtime.ListContainerStatsRequest) (tasks.MetricsRequest, error) { | ||
var req tasks.MetricsRequest | ||
if r == nil || r.Filter == nil { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: r.GetFilter() == nil
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
pkg/server/container_stats_list.go
Outdated
return nil | ||
} | ||
|
||
// buildTaskMetricsRequest constructs a taskMetricsRequest based on |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: task.MetricsRequest.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
:) done
pkg/server/container_stats_list.go
Outdated
return req, nil | ||
} | ||
|
||
func matchLabelSelector(selector map[string]string, labels map[string]string) bool { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: selector, labels map[string]string
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
pkg/server/container_stats_list.go
Outdated
} | ||
|
||
// Get the container from store and extract the attributes | ||
cnt, err := c.containerStore.Get(stats.ID) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's possible that a container shows up during list, but get deleted before this point. In that case, we should not return error for the whole ListContainerStats
.
We should either pass in the container config into this function, or distinguish not found error here.
The former seems cleaner to me.
pkg/server/container_stats.go
Outdated
"k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime" | ||
) | ||
|
||
// ContainerStats returns stats of the container. If the container does not | ||
// exist, the call returns an error. | ||
func (c *criContainerdService) ContainerStats(ctx context.Context, in *runtime.ContainerStatsRequest) (*runtime.ContainerStatsResponse, error) { | ||
return nil, errors.New("not implemented") | ||
// Validate the stats request | ||
if in == nil || in.GetContainerId() == "" { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: in.GetContainerId() == ""
is enough
integration/container_stats_test.go
Outdated
) | ||
|
||
const ( | ||
defaultTestImage = "gcr.io/google_containers/pause:3.0" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Either add pauseImage
in test_utils.go as a utility, or make this variable specific containerStatsTestImage
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
integration/container_stats_test.go
Outdated
assert.NoError(t, runtimeService.RemovePodSandbox(sb)) | ||
}() | ||
t.Logf("Create a container config and run container in a pod") | ||
opt := func() ContainerOpts { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Add helper function for labels and annotations.
s, err = runtimeService.ContainerStats(cn) | ||
if err != nil { | ||
return false, err | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Just testStats
here.
if err != nil { | ||
return false, err | ||
} | ||
for _, s := range stats { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: testStats
here directly.
integration/container_stats_test.go
Outdated
testStats(t, s) | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Each of the filter test takes 10 seconds. It means that the 3 filter tests take 30 seconds. I feel like a unit test for buildTaskMetricsRequest
is good enough given that we don't actually use these filters in CRI. And in unit test we could cover more cases, label filtering, other combination etc.
We could keep one filter test here to make sure containerd.Metrics
filtering does work.
WDYT?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thought process here is that its fine not to be conservative with tests especially with this scenario not being exercised by e2e tests. Also this would cover the basic combinations between the 2 components. Unit test is good to test the filtering I agree. I would like to add a TODO here to take this off once we have e2e coverage for these functionalities. Always happy to remove code :) Let me know.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like that we should:
- Add comprehensive unit test for filtering no matter we test filter or not here.
- There are many duplicated code in this file, we could easily use a table-driven test pattern to get rid of these duplication.
However, I don't want to block this PR because of test code, so I'm fine with adding TODO for now, and come back later.
integration/container_stats_test.go
Outdated
} | ||
} | ||
|
||
func testStats(t *testing.T, s *runtime.ContainerStats) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pass in container config for verification? Even for ListContainerStats
, we could find corresponding container config inside the list and pass in.
@Random-Liu addressed comments. PTAL. I have added to TODO to clean up the tests which I will raise a PR in a day. |
cc071de
to
39de552
Compare
pkg/server/container_stats_list.go
Outdated
"errors" | ||
|
||
"fmt" | ||
"github.com/golang/glog" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move to the next group.
pkg/server/container_stats_list.go
Outdated
if err != nil { | ||
return nil, fmt.Errorf("failed to build metrics request: %v", err) | ||
} | ||
resp, err := c.taskService.Metrics(context.Background(), &request) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pass ctx
in.
pkg/server/container_stats_list.go
Outdated
delete(candidateContainers, stat.ID) | ||
containerStats.Stats = append(containerStats.Stats, &cs) | ||
} | ||
// If there is a transient state where containers are dead at the time of query |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not a transient state. For any dead container, this will happen until we remove the container.
pkg/server/container_stats_list.go
Outdated
containerStats.Stats = append(containerStats.Stats, &cs) | ||
} | ||
// If there is a transient state where containers are dead at the time of query | ||
// but present in the containerStore , then check if the writeableLayer information |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
get rid of the space before ,
.
pkg/server/container_stats_list.go
Outdated
stats *types.Metric, | ||
cs *runtime.ContainerStats, | ||
) error { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unnecessary empty line.
LGTM overall with some nits. |
}() | ||
t.Logf("Create a container config and run containers in a pod") | ||
containerConfigMap := make(map[string]*runtime.ContainerConfig) | ||
for i := 0; i < 3; i++ { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should have a dead container in the test, and for dead container, testStats
won't work.
Probably add a TODO here, and you may want to manually validate it first.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
had added on top of testStats coz that needed change. I have manually verified the case for dead containers by introducing a stopContainer and checking the stats.
=== RUN TestContainerListStatsDead
&ContainerStats{Attributes:&ContainerAttributes{Id:6bf3066a1929839b5a5802cf3b3b49c7c292cc53364b3280d2a0a07dc24d9f77,Metadata:&ContainerMetadata{Name:container0,Attempt:0,},Labels:map[string]string{key: value,},Annotations:map[string]string{a.b.c: test,},},Cpu:nil,Memory:nil,WritableLayer:&FilesystemUsage{Timestamp:1506452046974590603,StorageId:&StorageIdentifier{Uuid:c656051a-9391-40fe-8333-57b410884232,},UsedBytes:&UInt64Value{Value:7,},InodesUsed:&UInt64Value{Value:20480,},},}
&ContainerStats{Attributes:&ContainerAttributes{Id:178a2722bd4ecec8728a952f07474b7714896f4afe8d69b3b58989b387e45804,Metadata:&ContainerMetadata{Name:container1,Attempt:0,},Labels:map[string]string{key: value,},Annotations:map[string]string{a.b.c: test,},},Cpu:nil,Memory:nil,WritableLayer:&FilesystemUsage{Timestamp:1506452046972987825,StorageId:&StorageIdentifier{Uuid:c656051a-9391-40fe-8333-57b410884232,},UsedBytes:&UInt64Value{Value:7,},InodesUsed:&UInt64Value{Value:20480,},},}
&ContainerStats{Attributes:&ContainerAttributes{Id:52e891cd436a10d71bc37d71f978d4142c4d49b7922ca7a2442ca75fe15c10b8,Metadata:&ContainerMetadata{Name:container2,Attempt:0,},Labels:map[string]string{key: value,},Annotations:map[string]string{a.b.c: test,},},Cpu:nil,Memory:nil,WritableLayer:&FilesystemUsage{Timestamp:1506452046973408844,StorageId:&StorageIdentifier{Uuid:c656051a-9391-40fe-8333-57b410884232,},UsedBytes:&UInt64Value{Value:7,},InodesUsed:&UInt64Value{Value:20480,},},}
--- PASS: TestContainerListStatsDead (32.88s)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks!
c8be4d4
to
49aa6f9
Compare
Signed-off-by: Abhinandan Prativadi <abhi@docker.com>
Signed-off-by: Abhinandan Prativadi <abhi@docker.com>
LGTM |
Built on top of #257
Review by commits
This commit does not include caching. Caching will be implemented if the benchmark results go northwards.