Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/container-metrics #121

Draft
wants to merge 22 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
50a645d
scrape container metrics from cadvisor
kenanfarukcakir Mar 19, 2024
c47e6e1
add go build cache
kenanfarukcakir Mar 19, 2024
6d8c0bc
get container metrics as io reader
kenanfarukcakir Mar 20, 2024
28a5859
get running processes on gpu
kenanfarukcakir Mar 20, 2024
12cb076
update alaz clusterrole
kenanfarukcakir Mar 20, 2024
3fb7311
inject k8sCollector to datastore
kenanfarukcakir Mar 20, 2024
00b481c
add env var for container metrics
kenanfarukcakir Mar 20, 2024
2792b9c
Merge branch 'feat/logs', remote-tracking branch 'origin' into feat/c…
kenanfarukcakir Mar 21, 2024
4def831
get container info from pid
kenanfarukcakir Mar 21, 2024
4b4be51
Merge branch 'feat/logs' into feat/container-metrics
kenanfarukcakir Mar 21, 2024
16130cc
add container_used_gpu_memory metric
kenanfarukcakir Mar 21, 2024
2eda507
parse hierarchies according to cgroup versions
kenanfarukcakir Mar 25, 2024
3c24843
Merge remote-tracking branch 'origin/develop' into feat/container-met…
kenanfarukcakir Mar 25, 2024
e980444
add CONTAINER_METRICS_ENABLED env
kenanfarukcakir Mar 25, 2024
c6ed4ce
Merge remote-tracking branch 'origin' into feat/container-metrics
kenanfarukcakir Apr 3, 2024
8169325
Merge remote-tracking branch 'origin/develop' into feat/container-met…
kenanfarukcakir Apr 3, 2024
fd6dda3
add ctx to cri tool
kenanfarukcakir Apr 3, 2024
e7c09c4
implement FilteredReader to filter prometheus container metrics
kenanfarukcakir Apr 3, 2024
2a69222
remove some logs
kenanfarukcakir Apr 3, 2024
15b25e8
filter out metrics with empty container
kenanfarukcakir Apr 4, 2024
4648b45
send container metrics in a different call
kenanfarukcakir Apr 4, 2024
88478b2
apply EXCLUDE_NAMESPACES to container metrics
kenanfarukcakir Apr 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Dockerfile.default
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
FROM golang:1.22.1-bullseye as builder
WORKDIR /app
COPY . ./
RUN apt update
RUN go mod download

ARG VERSION
ENV GOCACHE=/root/.cache/go-build

ARG VERSION
RUN go mod tidy -v
RUN --mount=type=cache,target="/root/.cache/go-build" GOOS=linux go build -ldflags="-X 'github.com/ddosify/alaz/datastore.tag=$VERSION'" -o alaz

Expand Down
9 changes: 5 additions & 4 deletions config/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ type PostgresConfig struct {
}

type BackendDSConfig struct {
Host string
MetricsExport bool
GpuMetricsExport bool
MetricsExportInterval int // in seconds
Host string
NodeMetricsExport bool
GpuMetricsExport bool
ContainerMetricsExport bool
MetricsExportInterval int // in seconds

ReqBufferSize int
ConnBufferSize int
Expand Down
182 changes: 168 additions & 14 deletions cri/cri.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@ import (
"encoding/json"
"fmt"
"os"
"os/exec"
"regexp"
"strconv"
"strings"
"time"

"github.com/ddosify/alaz/log"
Expand All @@ -26,14 +28,40 @@ var defaultRuntimeEndpoints = []string{"unix:///proc/1/root/run/containerd/conta
"unix:///proc/1/root/run/cri-dockerd.sock", "unix:///proc/1/root/var/run/cri-dockerd.sock"}

type ContainerPodInfo struct {
PodUid string
PodName string
PodNs string
PodUid string
PodName string
PodNs string
ContainerName string
}

// CRITool groups the runtime service handle with the state its methods share.
type CRITool struct {
// rs is the runtime service that the container and pod sandbox queries
// (e.g. ListContainers, ContainerStatus, PodSandboxStatus) are issued against.
rs internalapi.RuntimeService
// nsFilterRx is a compiled regular expression supplied at construction time;
// presumably a namespace filter — TODO confirm against the code that reads it.
nsFilterRx *regexp.Regexp
// ctx is the context handed to NewCRITool; methods pass it to the runtime
// service calls they make.
ctx context.Context
}

// parseCgroupFunc parses the /proc/<pid>/cgroup file at the given path and
// returns (podID, containerID, error). init points it at parseCgroupV1 or
// parseCgroupV2 depending on the filesystem type detected for /sys/fs/cgroup/.
var parseCgroupFunc func(string) (string, string, error)

// init selects the cgroup file parser that matches the host's cgroup version.
//
// The version is inferred from the filesystem type of /sys/fs/cgroup/ as
// reported by `stat -fc %T`: "cgroup2fs" selects parseCgroupV2, "tmpfs" selects
// parseCgroupV1. If the command fails or reports anything else, a warning is
// logged and the v1 parser is kept.
func init() {
	// Default to v1 up front so parseCgroupFunc is never nil, whatever happens below.
	parseCgroupFunc = parseCgroupV1

	output, err := exec.Command("stat", "-fc", "%T", "/sys/fs/cgroup/").CombinedOutput()
	if err != nil {
		// Attach the error as a field; Msg does not interpret format verbs.
		log.Logger.Warn().Err(err).Msg("Unable to find cgroup version, assuming v1")
		return
	}

	// Trim all surrounding whitespace, not just a single trailing newline.
	fsType := strings.TrimSpace(string(output))
	switch fsType {
	case "tmpfs":
		// cgroup v1: already the default.
	case "cgroup2fs":
		parseCgroupFunc = parseCgroupV2
	default:
		log.Logger.Warn().Msgf("Unknown filesystem type for cgroups: %s, assuming v1", fsType)
	}
}

func NewCRITool(ctx context.Context) (*CRITool, error) {
Expand Down Expand Up @@ -62,6 +90,7 @@ func NewCRITool(ctx context.Context) (*CRITool, error) {
return &CRITool{
rs: res,
nsFilterRx: nsFilterRx,
ctx: ctx,
}, nil
}

Expand Down Expand Up @@ -104,7 +133,7 @@ func (ct *CRITool) GetAllContainers() ([]*pb.Container, error) {
XXX_sizecache: 0,
}

list, err := ct.rs.ListContainers(context.TODO(), filter)
list, err := ct.rs.ListContainers(ct.ctx, filter)
if err != nil {
return nil, err
}
Expand All @@ -126,7 +155,7 @@ func (ct *CRITool) GetPidsRunningOnContainers() (map[uint32]struct{}, error) {
XXX_sizecache: 0,
}

list, err := ct.rs.ListContainers(context.TODO(), filter)
list, err := ct.rs.ListContainers(ct.ctx, filter)
if err != nil {
return nil, err
}
Expand All @@ -151,7 +180,7 @@ func (ct *CRITool) GetPidsRunningOnContainers() (map[uint32]struct{}, error) {
}

func (ct *CRITool) getAllRunningProcsInsideContainer(containerID string) ([]uint32, error) {
r, err := ct.rs.ContainerStatus(context.TODO(), containerID, true)
r, err := ct.rs.ContainerStatus(ct.ctx, containerID, true)
if err != nil {
log.Logger.Error().Err(err).Msgf("Failed to get container status for container %s", containerID)
return nil, err
Expand Down Expand Up @@ -232,7 +261,7 @@ func (ct *CRITool) GetLogPath(id string) (string, error) {
return "", fmt.Errorf("containerID cannot be empty")
}

r, err := ct.rs.ContainerStatus(context.TODO(), id, true)
r, err := ct.rs.ContainerStatus(ct.ctx, id, true)
if err != nil {
return "", err
}
Expand All @@ -243,24 +272,54 @@ func (ct *CRITool) GetLogPath(id string) (string, error) {
return fmt.Sprintf("/proc/1/root%s", r.Status.LogPath), nil
}

// ContainerInfo identifies a container and the pod it belongs to.
// It is the result type of GetContainerInfoWithPid and carries JSON tags for
// serialization.
type ContainerInfo struct {
// ContainerID is the container ID parsed from the process's cgroup file.
ContainerID string `json:"container_id"`
// ContainerName is the container name reported by ContainerStatus.
ContainerName string `json:"container_name"`
// PodID is the pod ID parsed from the process's cgroup file.
PodID string `json:"pod_id"`
// PodName is the pod name reported by ContainerStatus.
PodName string `json:"pod_name"`
// PodNamespace is the pod namespace reported by ContainerStatus.
PodNamespace string `json:"pod_namespace"`
}

// GetContainerInfoWithPid resolves the container and pod that process pid runs in.
//
// The pod and container IDs are read from /proc/<pid>/cgroup using the parser
// selected for the host's cgroup version; the names and namespace are then
// looked up through ContainerStatus.
//
// It returns an error if the cgroup file cannot be parsed or the container
// status lookup fails. In both cases the underlying error is wrapped, so
// callers can inspect it with errors.Is / errors.As.
func (ct *CRITool) GetContainerInfoWithPid(pid uint32) (*ContainerInfo, error) {
	podID, containerID, err := parseCgroupFunc(fmt.Sprintf("/proc/%d/cgroup", pid))
	if err != nil {
		return nil, fmt.Errorf("could not parse cgroup info for pid %d: %w", pid, err)
	}

	info, err := ct.ContainerStatus(containerID)
	if err != nil {
		// Keep the cause: previously it was dropped, hiding why the lookup failed.
		return nil, fmt.Errorf("could not get container info with id %s: %w", containerID, err)
	}

	return &ContainerInfo{
		PodID:         podID,
		ContainerID:   containerID,
		PodName:       info.PodName,
		PodNamespace:  info.PodNs,
		ContainerName: info.ContainerName,
	}, nil
}

func (ct *CRITool) ContainerStatus(id string) (*ContainerPodInfo, error) {
if id == "" {
return nil, fmt.Errorf("ID cannot be empty")
}

verbose := true

r, err := ct.rs.ContainerStatus(context.TODO(), id, verbose)
r, err := ct.rs.ContainerStatus(ct.ctx, id, verbose)
if err != nil {
return nil, err
}

containerName := r.Status.Metadata.Name

info := map[string]interface{}{}
json.Unmarshal([]byte(r.Info["info"]), &info)

sandBoxID := info["sandboxID"].(string)

podRes, err := ct.rs.PodSandboxStatus(context.TODO(), sandBoxID, verbose)
podRes, err := ct.rs.PodSandboxStatus(ct.ctx, sandBoxID, verbose)
if err != nil {
return nil, err
}
Expand All @@ -270,9 +329,10 @@ func (ct *CRITool) ContainerStatus(id string) (*ContainerPodInfo, error) {
podNamespace := podRes.Status.Metadata.Namespace

return &ContainerPodInfo{
PodUid: podUid,
PodName: podName,
PodNs: podNamespace,
PodUid: podUid,
PodName: podName,
PodNs: podNamespace,
ContainerName: containerName,
}, nil
}

Expand All @@ -290,7 +350,7 @@ func (ct *CRITool) getContainersOfPod(podSandboxId string) ([]*pb.Container, err
XXX_sizecache: 0,
}

list, err := ct.rs.ListContainers(context.TODO(), filter)
list, err := ct.rs.ListContainers(ct.ctx, filter)
if err != nil {
return nil, err
}
Expand All @@ -312,5 +372,99 @@ func (ct *CRITool) getPod(podUid string) ([]*pb.PodSandbox, error) {
}
filter.State = st

return ct.rs.ListPodSandbox(context.Background(), filter)
return ct.rs.ListPodSandbox(ct.ctx, filter)
}

// parseCgroupV1 reads the cgroup v1 style /proc/<pid>/cgroup file at filePath
// and returns the pod ID and container ID encoded in a hierarchy path.
//
// Each line has the form "<hierarchy-id>:<controllers>:<path>", for example:
//
//	1:name=systemd:/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod57009a28_e677_4550_8e14_7724a18cc70c.slice/cri-containerd-62b0b04c0d518199a25d7cd859c376caf71a850374ce1d76fc7410e54dd63a10.scope
//
// Lines are tried in order and the first one that parses wins. IDs are returned
// exactly as they appear in the path (the pod ID above keeps its underscores).
// If no line parses, the error from the last candidate line is returned; if no
// line had three fields at all, "unable to find cgroup info" is returned.
func parseCgroupV1(filePath string) (string, string, error) {
	log.Logger.Debug().Msgf("Parsing cgroup v1 file: %s", filePath)
	file, err := os.Open(filePath)
	if err != nil {
		return "", "", err
	}
	defer file.Close()

	var lastErr error
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		// Split on the first two ':' only; the path itself may contain ':'.
		parts := strings.SplitN(scanner.Text(), ":", 3)
		if len(parts) != 3 {
			continue
		}

		podID, containerID, err := parseCgroupV1Path(parts[2])
		if err != nil {
			// Another hierarchy line may still carry a usable path.
			lastErr = err
			continue
		}
		return podID, containerID, nil
	}

	if err := scanner.Err(); err != nil {
		return "", "", err
	}
	if lastErr != nil {
		return "", "", lastErr
	}
	return "", "", fmt.Errorf("unable to find cgroup info")
}

// parseCgroupV1Path extracts the pod and container IDs from the last two
// segments of a cgroup v1 hierarchy path, returning an error instead of
// panicking when a segment does not have the expected shape.
func parseCgroupV1Path(cgroupPath string) (string, string, error) {
	values := strings.Split(cgroupPath, "/")
	if len(values) < 5 {
		return "", "", fmt.Errorf("unexpected cgroup format")
	}

	// Second to last: kubepods-burstable-pod57009a28_e677_4550_8e14_7724a18cc70c.slice
	// Last:           cri-containerd-62b0b04c...dd63a10.scope
	podInfo := values[len(values)-2]
	containerInfo := values[len(values)-1]

	podIndex := strings.LastIndex(podInfo, "pod")
	sliceIndex := strings.Index(podInfo, ".slice")
	if podIndex < 0 || sliceIndex <= podIndex+3 {
		return "", "", fmt.Errorf("unexpected pod cgroup name %q", podInfo)
	}
	podID := podInfo[podIndex+3 : sliceIndex]

	// The container ID sits between the last '-' (or the start of the segment
	// when there is none) and the ".scope" suffix.
	containerDashIndex := strings.LastIndex(containerInfo, "-")
	scopeIndex := strings.Index(containerInfo, ".scope")
	if scopeIndex <= containerDashIndex+1 {
		return "", "", fmt.Errorf("unexpected container cgroup name %q", containerInfo)
	}
	containerID := containerInfo[containerDashIndex+1 : scopeIndex]

	return podID, containerID, nil
}

// parseCgroupV2 reads the cgroup v2 style /proc/<pid>/cgroup file at filePath
// and returns the pod ID and container ID encoded in its "0:" entry.
//
// The expected line has the form:
//
//	0::/system.slice/kubepods-burstable-pod22fd933a_ee61_46bd_93ae_ebace73c1160.slice:cri-containerd:3a31a360e5aea903274416e0c4cf8ca8c050fd523cc5d91a82ec5707d5ee9fa1
//
// The pod ID is the text between the last "pod" and ".slice" in the final path
// segment; the container ID is whatever follows the last ':'. IDs are returned
// exactly as they appear in the file.
//
// Lines whose first field is not "0" are skipped. A "0" line that does not have
// the shape above yields an error rather than a panic.
//
// NOTE(review): a "0::" path that ends in ".../cri-containerd-<id>.scope" with
// no colon-separated suffix is reported as an error here, not parsed — confirm
// whether that layout needs to be supported.
func parseCgroupV2(filePath string) (string, string, error) {
	log.Logger.Debug().Msgf("Parsing cgroup v2 file: %s", filePath)
	file, err := os.Open(filePath)
	if err != nil {
		return "", "", err
	}
	defer file.Close()

	var lastErr error
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		// Fields: hierarchy id, controller list, pod path, runtime-prefixed container id.
		parts := strings.SplitN(scanner.Text(), ":", 4)

		// cgroup v2 entries have 0 in the first field.
		if parts[0] != "0" {
			continue
		}

		podID, containerID, err := parseCgroupV2Fields(parts)
		if err != nil {
			lastErr = err
			continue
		}
		return podID, containerID, nil
	}

	if err := scanner.Err(); err != nil {
		return "", "", err
	}
	if lastErr != nil {
		return "", "", lastErr
	}
	return "", "", fmt.Errorf("unable to find cgroup info")
}

// parseCgroupV2Fields extracts the pod and container IDs from the four
// ':'-separated fields of a cgroup v2 entry, validating every index before
// slicing.
func parseCgroupV2Fields(parts []string) (string, string, error) {
	if len(parts) != 4 {
		return "", "", fmt.Errorf("unexpected cgroup format")
	}

	// /system.slice/kubepods-burstable-pod22fd933a_ee61_46bd_93ae_ebace73c1160.slice
	values := strings.Split(parts[2], "/")

	// kubepods-burstable-pod22fd933a_ee61_46bd_93ae_ebace73c1160.slice
	podInfoLastPart := values[len(values)-1]
	podIndex := strings.LastIndex(podInfoLastPart, "pod")
	sliceIndex := strings.Index(podInfoLastPart, ".slice")
	if podIndex < 0 || sliceIndex <= podIndex+3 {
		return "", "", fmt.Errorf("unexpected pod cgroup name %q", podInfoLastPart)
	}
	podID := podInfoLastPart[podIndex+3 : sliceIndex]

	// cri-containerd:3a31a360e5ae...d5ee9fa1 -> keep what follows the last ':'.
	containerInfo := parts[3]
	containerID := containerInfo[strings.LastIndex(containerInfo, ":")+1:]
	if containerID == "" {
		return "", "", fmt.Errorf("unexpected container cgroup name %q", containerInfo)
	}

	return podID, containerID, nil
}
Loading
Loading