From 495528bd96d5398cf5f454d27c1bd078d494e2be Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Tue, 29 Jun 2021 16:34:38 +0000 Subject: [PATCH 1/6] [agent-smith] account for egress traffic --- components/ee/agent-smith/pkg/agent/agent.go | 153 +++++++++++++++--- .../ee/agent-smith/pkg/network/egress.go | 59 +++++++ 2 files changed, 190 insertions(+), 22 deletions(-) create mode 100644 components/ee/agent-smith/pkg/network/egress.go diff --git a/components/ee/agent-smith/pkg/agent/agent.go b/components/ee/agent-smith/pkg/agent/agent.go index d1cba7ba6fc018..399e4f268c35bc 100644 --- a/components/ee/agent-smith/pkg/agent/agent.go +++ b/components/ee/agent-smith/pkg/agent/agent.go @@ -15,10 +15,13 @@ import ( "sort" "strconv" "strings" + "syscall" + "time" "unsafe" "github.com/cilium/ebpf/perf" "github.com/gitpod-io/gitpod/agent-smith/pkg/bpf" + "github.com/gitpod-io/gitpod/agent-smith/pkg/network" "github.com/gitpod-io/gitpod/agent-smith/pkg/signature" "github.com/gitpod-io/gitpod/common-go/log" "github.com/gitpod-io/gitpod/common-go/util" @@ -43,10 +46,12 @@ type Smith struct { Config Config GitpodAPI gitpod.APIInterface EnforcementRules map[string]EnforcementRules + EgressTraffic *EgressTraffic metrics *metrics notifiedInfringements *lru.Cache perfHandler chan perfHandlerFunc + pids map[int]time.Time } // EgressTraffic configures an upper limit of allowed egress traffic over time @@ -303,7 +308,7 @@ func (er EnforcementRules) Validate() error { } // Start gets a stream of Infringements from Run and executes a callback on them to apply a Penalty -func (agent *Smith) Start(callback func(InfringingWorkspace, []PenaltyKind)) { +func (agent *Smith) Start(ctx context.Context, callback func(InfringingWorkspace, []PenaltyKind)) { // todo(fntlnz): do the bpf loading here before running Run so that we have everything sorted out abpf, err := bpf.LoadAndAttach(agent.Config.ProbePath) @@ -313,37 +318,68 @@ func (agent *Smith) Start(callback func(InfringingWorkspace, []PenaltyKind)) { defer abpf.Close() + agent.cleanupDeadPIDS(ctx) + + egressTicker := time.NewTicker(30 * time.Second) + for i := 0; i < 10; i++ { go func(i int) { - for h := range agent.perfHandler { - if h == nil { - continue - } + for { + select { + case <-egressTicker.C: + for p, t := range agent.pids { + infr, err := agent.checkEgressTrafficCallback(strconv.Itoa(p), t) + if err != nil { + log.WithError(err).Warn("error checking egress for pid: %d", p) + continue + } + v, err := getWorkspaceFromProcess(p) + if err != nil { + log.WithError(err).Warn("error getting workspace from process with pid: %d", p) + continue + } + v.Infringements = append(v.Infringements, *infr) + ps, err := agent.Penalize(*v) + if err != nil { + log.WithError(err).WithField("infringement", v).Warn("error while reacting to infringement") + } + alreadyNotified, _ := agent.notifiedInfringements.ContainsOrAdd(v.VID(), nil) + if alreadyNotified { + continue + } + callback(*v, ps) + } + case h := <-agent.perfHandler: + if h == nil { + continue + } - v, err := h() - if err != nil { - log.WithError(err).Warn("error while running perf handler") - } + v, err := h() + if err != nil { + log.WithError(err).Warn("error while running perf handler") + } - // event did not generate an infringement - if v == nil { - continue - } - ps, err := agent.Penalize(*v) - if err != nil { - log.WithError(err).WithField("infringement", v).Warn("error while reacting to infringement") - } + // event did not generate an infringement + if v == nil { + continue + } + ps, err := agent.Penalize(*v) + if err != nil { + log.WithError(err).WithField("infringement", v).Warn("error while reacting to infringement") + } - alreadyNotified, _ := agent.notifiedInfringements.ContainsOrAdd(v.VID(), nil) - if alreadyNotified { - continue + alreadyNotified, _ := agent.notifiedInfringements.ContainsOrAdd(v.VID(), nil) + if alreadyNotified { + continue + } + callback(*v, ps) + case <-ctx.Done(): + return } - callback(*v, ps) } }(i) } - // todo(fntlnz): use a channel to cancel this execution for { rec, err := abpf.Read() if err != nil { @@ -357,6 +393,36 @@ func (agent *Smith) Start(callback func(InfringingWorkspace, []PenaltyKind)) { } } +func (agent *Smith) cleanupDeadPIDS(ctx context.Context) { + ticker := time.NewTicker(30 * time.Second) + go func() { + for { + select { + case <-ticker.C: + agent.cleanupDeadPidsCallback() + case <-ctx.Done(): + ticker.Stop() + return + } + } + }() +} + +func (agent *Smith) cleanupDeadPidsCallback() { + for p, _ := range agent.pids { + process, _ := os.FindProcess(p) + if process == nil { + delete(agent.pids, p) + continue + } + + err := process.Signal(syscall.Signal(0)) + if err != nil { + delete(agent.pids, p) + } + } +} + // Penalize acts on infringements and e.g. stops pods func (agent *Smith) Penalize(ws InfringingWorkspace) ([]PenaltyKind, error) { var remoteURL string @@ -526,6 +592,10 @@ func (agent *Smith) processPerfRecord(rec perf.Record) { // handles an execve event checks if it's infringing func (agent *Smith) handleExecveEvent(execve Execve) func() (*InfringingWorkspace, error) { + // this is not the exact process startup time + // but for the type of comparison we need to do is enough + agent.pids[execve.TID] = time.Now() + return func() (*InfringingWorkspace, error) { if agent.Config.Blacklists == nil { return nil, nil @@ -720,3 +790,42 @@ func mergeInfringingWorkspaces(vws []InfringingWorkspace) (vw InfringingWorkspac func (agent *Smith) RegisterMetrics(reg prometheus.Registerer) error { return agent.metrics.Register(reg) } + +func (agent *Smith) checkEgressTrafficCallback(pid string, pidCreationTime time.Time) (*Infringement, error) { + if agent.EgressTraffic == nil { + return nil, nil + } + podLifetime := time.Since(pidCreationTime) + resp, err := network.GetEgressTraffic(pid) + if err != nil { + return nil, err + } + if resp <= 0 { + log.WithField("total egress bytes", resp).Warn("GetEgressTraffic returned <= 0 value") + return nil, nil + } + + type level struct { + V GradedInfringementKind + T *PerLevelEgressTraffic + } + levels := make([]level, 0, 2) + if agent.EgressTraffic.VeryExcessiveLevel != nil { + levels = append(levels, level{V: GradeKind(InfringementExcessiveEgress, InfringementSeverityVery), T: agent.EgressTraffic.VeryExcessiveLevel}) + } + if agent.EgressTraffic.ExcessiveLevel != nil { + levels = append(levels, level{V: GradeKind(InfringementExcessiveEgress, InfringementSeverityAudit), T: agent.EgressTraffic.ExcessiveLevel}) + } + + dt := int64(podLifetime / time.Duration(agent.EgressTraffic.WindowDuration)) + for _, lvl := range levels { + allowance := dt*lvl.T.Threshold.Value() + lvl.T.BaseBudget.Value() + excess := resp - allowance + + if excess > 0 { + return &Infringement{Description: fmt.Sprintf("egress traffic is %.3f megabytes over limit", float64(excess)/(1024.0*1024.0)), Kind: lvl.V}, nil + } + } + + return nil, nil +} diff --git a/components/ee/agent-smith/pkg/network/egress.go b/components/ee/agent-smith/pkg/network/egress.go new file mode 100644 index 00000000000000..78d02e515b1120 --- /dev/null +++ b/components/ee/agent-smith/pkg/network/egress.go @@ -0,0 +1,59 @@ +// Copyright (c) 2021 Gitpod GmbH. All rights reserved. +// Licensed under the Gitpod Enterprise Source Code License, +// See License.enterprise.txt in the project root folder. + +package network + +import ( + "encoding/csv" + "fmt" + "io" + "os" + "path" + "strconv" + "strings" +) + +func readDeviceEgress(inpt io.Reader, dev string) (total int64, err error) { + csvreader := csv.NewReader(inpt) + csvreader.Comma = ' ' + csvreader.FieldsPerRecord = 17 + csvreader.TrimLeadingSpace = true + + var totalEgress int64 = -1 + //nolint:errcheck,staticcheck + for rec, err := csvreader.Read(); rec != nil; rec, err = csvreader.Read() { + if len(rec) < 9 { + continue + } + if !strings.HasPrefix(rec[0], dev) { + continue + } + + totalEgress, err = strconv.ParseInt(rec[9], 10, 64) + if err != nil { + return 0, err + } + break + } + if totalEgress < 0 { + return 0, fmt.Errorf("did not find interface") + } + + return totalEgress, nil +} + +func GetEgressTraffic(pid string) (int64, error) { + file, err := os.OpenFile(path.Join("/proc", pid, "/net/dev"), os.O_RDONLY, 0600) + if err != nil { + return 0, err + } + defer file.Close() + + totalEgress, err := readDeviceEgress(file, "eth0") + if err != nil { + return 0, err + } + + return totalEgress, nil +} From 66d7ec0b22f93463a441e942499146d06e0bdfc4 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Tue, 29 Jun 2021 16:47:41 +0000 Subject: [PATCH 2/6] [agent-smith] better signal handling --- components/ee/agent-smith/cmd/run.go | 15 ++++++- components/ee/agent-smith/pkg/agent/agent.go | 42 ++++++++++++------- .../ee/agent-smith/pkg/network/egress.go | 3 ++ 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/components/ee/agent-smith/cmd/run.go b/components/ee/agent-smith/cmd/run.go index b78b6415c3d6bc..970d4040de3b72 100644 --- a/components/ee/agent-smith/cmd/run.go +++ b/components/ee/agent-smith/cmd/run.go @@ -5,6 +5,7 @@ package cmd import ( + "context" "fmt" "net/http" "os" @@ -61,7 +62,11 @@ var runCmd = &cobra.Command{ log.WithError(err).Fatal("cannot register metrics") } - go smith.Start(func(violation agent.InfringingWorkspace, penalties []agent.PenaltyKind) { + ctx := context.Background() + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + go smith.Start(ctx, func(violation agent.InfringingWorkspace, penalties []agent.PenaltyKind) { log.WithField("violation", violation).WithField("penalties", penalties).Info("Found violation") if cfg.SlackWebhooks != nil { @@ -94,7 +99,13 @@ var runCmd = &cobra.Command{ sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) - <-sigChan + + select { + case <-ctx.Done(): + return + case <-sigChan: + return + } }, } diff --git a/components/ee/agent-smith/pkg/agent/agent.go b/components/ee/agent-smith/pkg/agent/agent.go index 399e4f268c35bc..b81b4c83538065 100644 --- a/components/ee/agent-smith/pkg/agent/agent.go +++ b/components/ee/agent-smith/pkg/agent/agent.go @@ -15,6 +15,7 @@ import ( "sort" "strconv" "strings" + "sync" "syscall" "time" "unsafe" @@ -51,7 +52,7 @@ type Smith struct { notifiedInfringements *lru.Cache perfHandler chan perfHandlerFunc - pids map[int]time.Time + pidsMap sync.Map } // EgressTraffic configures an upper limit of allowed egress traffic over time @@ -143,6 +144,7 @@ func NewAgentSmith(cfg Config) (*Smith, error) { notifiedInfringements: notificationCache, perfHandler: make(chan perfHandlerFunc, 10), metrics: newAgentMetrics(), + pidsMap: sync.Map{}, } if cfg.Enforcement.Default != nil { if err := cfg.Enforcement.Default.Validate(); err != nil { @@ -327,16 +329,21 @@ func (agent *Smith) Start(ctx context.Context, callback func(InfringingWorkspace for { select { case <-egressTicker.C: - for p, t := range agent.pids { + agent.pidsMap.Range(func(key, value interface{}) bool { + p := key.(int) + t := value.(time.Time) infr, err := agent.checkEgressTrafficCallback(strconv.Itoa(p), t) if err != nil { - log.WithError(err).Warn("error checking egress for pid: %d", p) - continue + log.WithError(err).Warnf("error checking egress for pid: %d", p) + return true + } + if infr == nil { + return true } v, err := getWorkspaceFromProcess(p) if err != nil { - log.WithError(err).Warn("error getting workspace from process with pid: %d", p) - continue + // this is not from a workspace, let's skip + return true } v.Infringements = append(v.Infringements, *infr) ps, err := agent.Penalize(*v) @@ -345,10 +352,11 @@ func (agent *Smith) Start(ctx context.Context, callback func(InfringingWorkspace } alreadyNotified, _ := agent.notifiedInfringements.ContainsOrAdd(v.VID(), nil) if alreadyNotified { - continue + return true } callback(*v, ps) - } + return true + }) case h := <-agent.perfHandler: if h == nil { continue @@ -409,18 +417,24 @@ func (agent *Smith) cleanupDeadPIDS(ctx context.Context) { } func (agent *Smith) cleanupDeadPidsCallback() { - for p, _ := range agent.pids { + agent.pidsMap.Range(func(key, value interface{}) bool { + p := key.(int) + process, _ := os.FindProcess(p) if process == nil { - delete(agent.pids, p) - continue + agent.pidsMap.Delete(p) + return true } err := process.Signal(syscall.Signal(0)) if err != nil { - delete(agent.pids, p) + agent.pidsMap.Delete(p) + return true } - } + + return true + }) + } // Penalize acts on infringements and e.g. stops pods @@ -594,7 +608,7 @@ func (agent *Smith) processPerfRecord(rec perf.Record) { func (agent *Smith) handleExecveEvent(execve Execve) func() (*InfringingWorkspace, error) { // this is not the exact process startup time // but for the type of comparison we need to do is enough - agent.pids[execve.TID] = time.Now() + agent.pidsMap.Store(execve.TID, time.Now()) return func() (*InfringingWorkspace, error) { if agent.Config.Blacklists == nil { diff --git a/components/ee/agent-smith/pkg/network/egress.go b/components/ee/agent-smith/pkg/network/egress.go index 78d02e515b1120..6fa7ed0db603f5 100644 --- a/components/ee/agent-smith/pkg/network/egress.go +++ b/components/ee/agent-smith/pkg/network/egress.go @@ -23,6 +23,9 @@ func readDeviceEgress(inpt io.Reader, dev string) (total int64, err error) { var totalEgress int64 = -1 //nolint:errcheck,staticcheck for rec, err := csvreader.Read(); rec != nil; rec, err = csvreader.Read() { + if err != nil { + return 0, err + } if len(rec) < 9 { continue } From e8d313be0381c99c4eea8bdd531d5352ac8177eb Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Thu, 1 Jul 2021 14:49:25 +0000 Subject: [PATCH 3/6] [agent-smith] Parse net dev and create an infringement --- chart/templates/agent-smith-configmap.yaml | 21 +- components/ee/agent-smith/BUILD.yaml | 2 +- components/ee/agent-smith/README.md | 6 + components/ee/agent-smith/example-config.json | 22 +- components/ee/agent-smith/pkg/agent/agent.go | 31 +- .../ee/agent-smith/pkg/agent/agent_test.go | 364 ------------------ .../ee/agent-smith/pkg/network/egress.go | 54 +-- 7 files changed, 72 insertions(+), 428 deletions(-) diff --git a/chart/templates/agent-smith-configmap.yaml b/chart/templates/agent-smith-configmap.yaml index 5a5534474e89a0..0344438dbf84f8 100644 --- a/chart/templates/agent-smith-configmap.yaml +++ b/chart/templates/agent-smith-configmap.yaml @@ -22,12 +22,29 @@ data: "blacklists": { "very": { "signatures": [ - {"name":"testtarget","domain":"process","kind":"elf","pattern":"YWdlbnRTbWl0aFRlc3RUYXJnZXQ=","regexp":false} + { + "name": "testtarget", + "domain": "process", + "kind": "elf", + "pattern": "YWdlbnRTbWl0aFRlc3RUYXJnZXQ=", + "regexp": false + } ] } }, "pprofAddr": "localhost:6060", "prometheusAddr": "localhost:9500", "hostURL": "https://{{ $.Values.hostname }}" + "egressTraffic": { + "dt": "2m", + "excessive": { + "baseBudget": "300Mi", + "perDtThreshold": "100Mi" + }, + "veryExcessive": { + "baseBudget": "2Gi", + "perDtThreshold": "250Mi" + } + } } -{{- end -}} \ No newline at end of file +{{- end -}} diff --git a/components/ee/agent-smith/BUILD.yaml b/components/ee/agent-smith/BUILD.yaml index 5e9185603bcbca..c15611a5d2c050 100644 --- a/components/ee/agent-smith/BUILD.yaml +++ b/components/ee/agent-smith/BUILD.yaml @@ -59,5 +59,5 @@ scripts: - components/ee/agent-smith/cmd/testbed:app - components/ee/agent-smith/cmd/testtarget:app script: | - scp vm ./components-ee-agent-smith--falco-bpf-probe/probe.o ./components-ee-agent-smith--app/agent-smith ./components-ee-agent-smith--example-config/example-config.json ./components-ee-agent-smith-cmd-testbed--app/testbed ./components-ee-agent-smith-cmd-testtarget--app/testtarget root@localhost:/ + scp -P 2222 -i ~/.ssh/id_rsa_vm -o StrictHostKeyChecking=no vm ./components-ee-agent-smith--falco-bpf-probe/probe.o ./components-ee-agent-smith--app/agent-smith ./components-ee-agent-smith--example-config/example-config.json ./components-ee-agent-smith-cmd-testbed--app/testbed ./components-ee-agent-smith-cmd-testtarget--app/testtarget root@localhost:/ echo "copied agent-smith to /" diff --git a/components/ee/agent-smith/README.md b/components/ee/agent-smith/README.md index a1ff580cf3a324..95eb0901100b3f 100644 --- a/components/ee/agent-smith/README.md +++ b/components/ee/agent-smith/README.md @@ -43,6 +43,12 @@ ssh vm If you now go under the `/workspace` folder in the VM, you will find all your workspace stuff. +If you want to compile with leeway and have the compiled artifacts in the VM you can do + +``` +leeway run components/ee/agent-smith:copy-to-qemu +``` + ## Falco libs BPF probe development In case you need to do development of new features or fix bugs against the diff --git a/components/ee/agent-smith/example-config.json b/components/ee/agent-smith/example-config.json index c2e0ca2f1567de..34855cbc6763a0 100644 --- a/components/ee/agent-smith/example-config.json +++ b/components/ee/agent-smith/example-config.json @@ -3,10 +3,26 @@ "probePath": "./probe.o", "blacklists": { "very": { - "binaries": ["find"], "signatures": [ - {"name":"testtarget","domain":"process","kind":"elf","pattern":"YWdlbnRTbWl0aFRlc3RUYXJnZXQ=","regexp":false} + { + "name": "testtarget", + "domain": "process", + "kind": "elf", + "pattern": "YWdlbnRTbWl0aFRlc3RUYXJnZXQ=", + "regexp": false + } ] } + }, + "egressTraffic": { + "dt": "2m", + "excessive": { + "baseBudget": "300Mi", + "perDtThreshold": "100Mi" + }, + "veryExcessive": { + "baseBudget": "2Gi", + "perDtThreshold": "250Mi" + } } -} \ No newline at end of file +} diff --git a/components/ee/agent-smith/pkg/agent/agent.go b/components/ee/agent-smith/pkg/agent/agent.go index b81b4c83538065..20d4e4ee035a59 100644 --- a/components/ee/agent-smith/pkg/agent/agent.go +++ b/components/ee/agent-smith/pkg/agent/agent.go @@ -47,7 +47,6 @@ type Smith struct { Config Config GitpodAPI gitpod.APIInterface EnforcementRules map[string]EnforcementRules - EgressTraffic *EgressTraffic metrics *metrics notifiedInfringements *lru.Cache @@ -332,20 +331,20 @@ func (agent *Smith) Start(ctx context.Context, callback func(InfringingWorkspace agent.pidsMap.Range(func(key, value interface{}) bool { p := key.(int) t := value.(time.Time) - infr, err := agent.checkEgressTrafficCallback(strconv.Itoa(p), t) + infr, err := agent.checkEgressTrafficCallback(p, t) if err != nil { - log.WithError(err).Warnf("error checking egress for pid: %d", p) return true } if infr == nil { return true } + var res []Infringement v, err := getWorkspaceFromProcess(p) if err != nil { - // this is not from a workspace, let's skip return true } - v.Infringements = append(v.Infringements, *infr) + res = append(res, *infr) + v.Infringements = res ps, err := agent.Penalize(*v) if err != nil { log.WithError(err).WithField("infringement", v).Warn("error while reacting to infringement") @@ -432,6 +431,12 @@ func (agent *Smith) cleanupDeadPidsCallback() { return true } + _, err = getWorkspaceFromProcess(p) + if err != nil { + agent.pidsMap.Delete(p) + return true + } + return true }) @@ -805,15 +810,17 @@ func (agent *Smith) RegisterMetrics(reg prometheus.Registerer) error { return agent.metrics.Register(reg) } -func (agent *Smith) checkEgressTrafficCallback(pid string, pidCreationTime time.Time) (*Infringement, error) { - if agent.EgressTraffic == nil { +func (agent *Smith) checkEgressTrafficCallback(pid int, pidCreationTime time.Time) (*Infringement, error) { + if agent.Config.EgressTraffic == nil { return nil, nil } + podLifetime := time.Since(pidCreationTime) resp, err := network.GetEgressTraffic(pid) if err != nil { return nil, err } + if resp <= 0 { log.WithField("total egress bytes", resp).Warn("GetEgressTraffic returned <= 0 value") return nil, nil @@ -824,14 +831,14 @@ func (agent *Smith) checkEgressTrafficCallback(pid string, pidCreationTime time. T *PerLevelEgressTraffic } levels := make([]level, 0, 2) - if agent.EgressTraffic.VeryExcessiveLevel != nil { - levels = append(levels, level{V: GradeKind(InfringementExcessiveEgress, InfringementSeverityVery), T: agent.EgressTraffic.VeryExcessiveLevel}) + if agent.Config.EgressTraffic.VeryExcessiveLevel != nil { + levels = append(levels, level{V: GradeKind(InfringementExcessiveEgress, InfringementSeverityVery), T: agent.Config.EgressTraffic.VeryExcessiveLevel}) } - if agent.EgressTraffic.ExcessiveLevel != nil { - levels = append(levels, level{V: GradeKind(InfringementExcessiveEgress, InfringementSeverityAudit), T: agent.EgressTraffic.ExcessiveLevel}) + if agent.Config.EgressTraffic.ExcessiveLevel != nil { + levels = append(levels, level{V: GradeKind(InfringementExcessiveEgress, InfringementSeverityAudit), T: agent.Config.EgressTraffic.ExcessiveLevel}) } - dt := int64(podLifetime / time.Duration(agent.EgressTraffic.WindowDuration)) + dt := int64(podLifetime / time.Duration(agent.Config.EgressTraffic.WindowDuration)) for _, lvl := range levels { allowance := dt*lvl.T.Threshold.Value() + lvl.T.BaseBudget.Value() excess := resp - allowance diff --git a/components/ee/agent-smith/pkg/agent/agent_test.go b/components/ee/agent-smith/pkg/agent/agent_test.go index 247b7c706f6d1a..abc83ec17772a3 100644 --- a/components/ee/agent-smith/pkg/agent/agent_test.go +++ b/components/ee/agent-smith/pkg/agent/agent_test.go @@ -93,367 +93,3 @@ func BenchmarkFindEnforcementRules(b *testing.B) { findEnforcementRules(rules, "foobar") } } - -// func TestCheckForSignature(t *testing.T) { -// sigs := []*signature.Signature{ -// &signature.Signature{Name: "fssig", Domain: signature.DomainFileSystem, Kind: signature.ObjectAny, Filename: []string{"foo"}, Pattern: []byte(base64.StdEncoding.EncodeToString([]byte("foo")))}, -// &signature.Signature{Name: "procsig", Domain: signature.DomainProcess, Kind: signature.ObjectAny, Pattern: []byte(base64.StdEncoding.EncodeToString([]byte("foo")))}, -// } - -// tests := []struct { -// Desc string -// Opts []NewAgentSmithOption -// Domain signature.Domain -// Sigs []*signature.Signature -// Mock func(t *testing.T, m *sentinel.MockSentinelServer) -// Severity InfringementSeverity -// Res *Infringement -// }{ -// { -// Desc: "filesystem domain", -// Domain: signature.DomainFileSystem, -// Sigs: sigs, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().FindSignature(gomock.Any(), gomock.Eq(&sentinel.FindSignatureRequest{ -// Domain: sentinel.FindSignatureDomain_Filesystem, -// Signature: sentinel.ConvertSignaturesToProtocol(sigs[0:1]), -// })).Return(&sentinel.FindSignatureResponse{}, nil).MinTimes(1) -// }, -// }, -// { -// Desc: "process domain", -// Domain: signature.DomainProcess, -// Sigs: sigs, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().FindSignature(gomock.Any(), gomock.Eq(&sentinel.FindSignatureRequest{ -// Domain: sentinel.FindSignatureDomain_Process, -// Signature: sentinel.ConvertSignaturesToProtocol(sigs[1:2]), -// })).Return(&sentinel.FindSignatureResponse{}, nil).MinTimes(1) -// }, -// }, -// { -// Desc: "returns infringement", -// Domain: signature.DomainProcess, -// Sigs: sigs, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().FindSignature(gomock.Any(), gomock.Any()).Return(&sentinel.FindSignatureResponse{ -// Target: "foobar", -// Signature: sentinel.ConvertSignaturesToProtocol(sigs)[1], -// }, nil).MinTimes(1) -// }, -// Severity: InfringementSeverityVery, -// Res: &Infringement{ -// Description: "user ran program matching procsig signature: foobar", -// Kind: GradeKind(InfringementExecBlacklistedCmd, InfringementSeverityVery), -// }, -// }, -// } -// for _, test := range tests { -// t.Run(test.Desc, func(t *testing.T) { -// agent, err := NewAgentSmith(fakek8s.NewSimpleClientset(), test.Opts...) -// if err != nil { -// t.Fatalf("cannot create test agent: %q", err) -// } - -// ctrl := gomock.NewController(t) -// defer ctrl.Finish() - -// ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) -// defer cancel() - -// sensrv := sentinel.NewMockSentinelServer(ctrl) -// test.Mock(t, sensrv) -// conn, err := connectToMockSentinel(ctx, sensrv) -// if err != nil { -// t.Fatalf("cannot connect to mock sentinel: %q", err) -// } - -// sen := sentinel.NewSentinelClient(conn) -// res, err := agent.checkForSignature(ctx, "foobar", sen, test.Severity, test.Domain, test.Sigs) -// if err != nil { -// t.Errorf("unexpected error: %q", err) -// } - -// if diff := cmp.Diff(test.Res, res); diff != "" { -// t.Errorf("unexpected result (-want +got):\n%s", diff) -// } -// }) -// } -// } - -// func TestCheckEgressTraffic(t *testing.T) { -// egressSettings := &EgressTraffic{ -// WindowDuration: util.Duration(2 * time.Minute), -// ExcessiveLevel: &PerLevelEgressTraffic{ -// BaseBudget: resource.MustParse("500Mi"), -// Threshold: resource.MustParse("100Mi"), -// }, -// VeryExcessiveLevel: &PerLevelEgressTraffic{ -// BaseBudget: resource.MustParse("1Gi"), -// Threshold: resource.MustParse("100Mi"), -// }, -// } -// var ( -// excessiveLevel = resource.MustParse("800Mi") -// veryExcessiveLevel = resource.MustParse("1200Mi") -// ) - -// tests := []struct { -// Desc string -// Opts []NewAgentSmithOption -// PodLifetime time.Duration -// Mock func(t *testing.T, m *sentinel.MockSentinelServer) -// Res *Infringement -// }{ -// { -// Desc: "no infringement", -// Opts: []NewAgentSmithOption{WithEgressTraffic(egressSettings)}, -// PodLifetime: 1 * time.Minute, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().GetEgressTraffic(gomock.Any(), gomock.Any()).Return(&sentinel.GetEgressTrafficResponse{ -// TotalBytes: 1, -// }, nil).MinTimes(1) -// }, -// }, -// { -// Desc: "excessive", -// Opts: []NewAgentSmithOption{WithEgressTraffic(egressSettings)}, -// PodLifetime: 1 * time.Minute, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().GetEgressTraffic(gomock.Any(), gomock.Any()).Return(&sentinel.GetEgressTrafficResponse{ -// TotalBytes: excessiveLevel.Value(), -// }, nil).MinTimes(1) -// }, -// Res: &Infringement{ -// Description: "egress traffic is 300.000 megabytes over limit", -// Kind: GradeKind(InfringementExcessiveEgress, InfringementSeverityAudit), -// }, -// }, -// { -// Desc: "very excessive", -// Opts: []NewAgentSmithOption{WithEgressTraffic(egressSettings)}, -// PodLifetime: 1 * time.Minute, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().GetEgressTraffic(gomock.Any(), gomock.Any()).Return(&sentinel.GetEgressTrafficResponse{ -// TotalBytes: veryExcessiveLevel.Value(), -// }, nil).MinTimes(1) -// }, -// Res: &Infringement{ -// Description: "egress traffic is 176.000 megabytes over limit", -// Kind: GradeKind(InfringementExcessiveEgress, InfringementSeverityVery), -// }, -// }, -// } -// for _, test := range tests { -// t.Run(test.Desc, func(t *testing.T) { -// agent, err := NewAgentSmith(fakek8s.NewSimpleClientset(), test.Opts...) -// if err != nil { -// t.Fatalf("cannot create test agent: %q", err) -// } - -// ctrl := gomock.NewController(t) -// defer ctrl.Finish() - -// ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) -// defer cancel() - -// sensrv := sentinel.NewMockSentinelServer(ctrl) -// test.Mock(t, sensrv) -// conn, err := connectToMockSentinel(ctx, sensrv) -// if err != nil { -// t.Fatalf("cannot connect to mock sentinel: %q", err) -// } - -// sen := sentinel.NewSentinelClient(conn) -// res, err := agent.checkEgressTraffic(ctx, &corev1.Pod{ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(time.Now().Add(-test.PodLifetime))}}, sen) -// if err != nil { -// t.Errorf("unexpected error: %q", err) -// } -// if diff := cmp.Diff(test.Res, res); diff != "" { -// t.Errorf("unexpected result (-want +got):\n%s", diff) -// } -// }) -// } -// } - -// func TestCheckCPUUse(t *testing.T) { -// tests := []struct { -// Desc string -// Opts []NewAgentSmithOption -// PodLifetime time.Duration -// Mock func(t *testing.T, m *sentinel.MockSentinelServer) -// Res *Infringement -// }{ -// { -// Desc: "no infringement", -// Opts: []NewAgentSmithOption{WithCPUUseCheck(4.5, 2)}, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().GetCPUInfo(gomock.Any(), gomock.Any()).Return(&sentinel.GetCPUInfoResponse{ -// Load: 3, -// TopCommand: "none", -// }, nil).MinTimes(1) -// }, -// }, -// { -// Desc: "infringement", -// Opts: []NewAgentSmithOption{WithCPUUseCheck(4.5, 2)}, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().GetCPUInfo(gomock.Any(), gomock.Any()).Return(&sentinel.GetCPUInfoResponse{ -// Load: 5, -// TopCommand: "evil-process", -// }, nil).MinTimes(1) -// }, -// Res: &Infringement{ -// Description: "CPU load 5.00 > 4.50 for 2 minutes. Top command is evil-process", -// Kind: GradeKind(InfringementExcessiveCPUUse, InfringementSeverityAudit), -// }, -// }, -// } -// for _, test := range tests { -// t.Run(test.Desc, func(t *testing.T) { -// agent, err := NewAgentSmith(fakek8s.NewSimpleClientset(), test.Opts...) -// if err != nil { -// t.Fatalf("cannot create test agent: %q", err) -// } - -// ctrl := gomock.NewController(t) -// defer ctrl.Finish() - -// ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) -// defer cancel() - -// sensrv := sentinel.NewMockSentinelServer(ctrl) -// test.Mock(t, sensrv) -// conn, err := connectToMockSentinel(ctx, sensrv) -// if err != nil { -// t.Fatalf("cannot connect to mock sentinel: %q", err) -// } - -// sen := sentinel.NewSentinelClient(conn) -// res, err := agent.checkForCPUUse(ctx, "foobar", sen) -// if err != nil { -// t.Errorf("unexpected error: %q", err) -// } -// if diff := cmp.Diff(test.Res, res); diff != "" { -// t.Errorf("unexpected result (-want +got):\n%s", diff) -// } -// }) -// } -// } - -// func TestBlacklistedCommand(t *testing.T) { -// binaries := func(p string) []string { -// return []string{p + "foo", p + "bar"} -// } -// blacklists := &Blacklists{ -// Barely: &PerLevelBlacklist{Binaries: binaries("b_")}, -// Audit: &PerLevelBlacklist{Binaries: binaries("d_")}, -// Very: &PerLevelBlacklist{Binaries: binaries("v_")}, -// } -// tests := []struct { -// Desc string -// Opts []NewAgentSmithOption -// Mock func(t *testing.T, m *sentinel.MockSentinelServer) -// Res *Infringement -// }{ -// { -// Desc: "no infringement", -// Opts: []NewAgentSmithOption{WithBlacklists(blacklists)}, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().IsCommandRunning(gomock.Any(), gomock.Any()).Return(&sentinel.IsCommandRunningRespose{}, nil).MinTimes(1) -// }, -// }, -// { -// Desc: "barely infringement", -// Opts: []NewAgentSmithOption{WithBlacklists(blacklists)}, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().IsCommandRunning(gomock.Any(), gomock.Eq(&sentinel.IsCommandRunningRequest{Command: binaries("b_")})).Return(&sentinel.IsCommandRunningRespose{ -// Findings: binaries("b_"), -// }, nil).MinTimes(1) -// m.EXPECT().IsCommandRunning(gomock.Any(), gomock.Any()).Return(&sentinel.IsCommandRunningRespose{}, nil).MinTimes(2) -// }, -// Res: &Infringement{ -// Description: "user ran barely blacklisted command: b_foo", -// Kind: GradeKind(InfringementExecBlacklistedCmd, InfringementSeverityBarely), -// }, -// }, - -// { -// Desc: "default infringement", -// Opts: []NewAgentSmithOption{WithBlacklists(blacklists)}, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().IsCommandRunning(gomock.Any(), gomock.Eq(&sentinel.IsCommandRunningRequest{Command: binaries("d_")})).Return(&sentinel.IsCommandRunningRespose{ -// Findings: binaries("d_"), -// }, nil).MinTimes(1) -// m.EXPECT().IsCommandRunning(gomock.Any(), gomock.Any()).Return(&sentinel.IsCommandRunningRespose{}, nil).MinTimes(1) -// }, -// Res: &Infringement{ -// Description: "user ran blacklisted command: d_foo", -// Kind: GradeKind(InfringementExecBlacklistedCmd, InfringementSeverityAudit), -// }, -// }, - -// { -// Desc: "very infringement", -// Opts: []NewAgentSmithOption{WithBlacklists(blacklists)}, -// Mock: func(t *testing.T, m *sentinel.MockSentinelServer) { -// m.EXPECT().IsCommandRunning(gomock.Any(), gomock.Eq(&sentinel.IsCommandRunningRequest{Command: binaries("v_")})).Return(&sentinel.IsCommandRunningRespose{ -// Findings: binaries("v_"), -// }, nil).MinTimes(1) -// m.EXPECT().IsCommandRunning(gomock.Any(), gomock.Any()).Return(&sentinel.IsCommandRunningRespose{}, nil).MaxTimes(0) -// }, -// Res: &Infringement{ -// Description: "user ran very blacklisted command: v_foo", -// Kind: GradeKind(InfringementExecBlacklistedCmd, InfringementSeverityVery), -// }, -// }, -// } -// for _, test := range tests { -// t.Run(test.Desc, func(t *testing.T) { -// agent, err := NewAgentSmith(fakek8s.NewSimpleClientset(), test.Opts...) -// if err != nil { -// t.Fatalf("cannot create test agent: %q", err) -// } - -// ctrl := gomock.NewController(t) -// defer ctrl.Finish() - -// ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) -// defer cancel() - -// sensrv := sentinel.NewMockSentinelServer(ctrl) -// test.Mock(t, sensrv) -// conn, err := connectToMockSentinel(ctx, sensrv) -// if err != nil { -// t.Fatalf("cannot connect to mock sentinel: %q", err) -// } - -// sen := sentinel.NewSentinelClient(conn) -// res, err := agent.checkForBlacklistedCommand(ctx, "foobar", sen) -// if err != nil { -// t.Errorf("unexpected error: %q", err) -// } -// if diff := cmp.Diff(test.Res, res); diff != "" { -// t.Errorf("unexpected result (-want +got):\n%s", diff) -// } -// }) -// } -// } - -// func connectToMockSentinel(ctx context.Context, sensrv sentinel.SentinelServer) (*grpc.ClientConn, error) { -// lis := bufconn.Listen(1024 * 1024) -// srv := grpc.NewServer() -// sentinel.RegisterSentinelServer(srv, sensrv) -// go func() { -// err := srv.Serve(lis) -// if err != nil { -// panic(fmt.Sprintf("grpc failure: %q", err)) -// } -// }() - -// conn, err := grpc.DialContext(ctx, "bufnet", grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) { return lis.Dial() }), grpc.WithInsecure()) -// if err != nil { -// return nil, err -// } -// return conn, nil -// } diff --git a/components/ee/agent-smith/pkg/network/egress.go b/components/ee/agent-smith/pkg/network/egress.go index 6fa7ed0db603f5..98fd2f0e013990 100644 --- a/components/ee/agent-smith/pkg/network/egress.go +++ b/components/ee/agent-smith/pkg/network/egress.go @@ -5,58 +5,20 @@ package network import ( - "encoding/csv" - "fmt" - "io" - "os" - "path" - "strconv" - "strings" + "github.com/prometheus/procfs" ) -func readDeviceEgress(inpt io.Reader, dev string) (total int64, err error) { - csvreader := csv.NewReader(inpt) - csvreader.Comma = ' ' - csvreader.FieldsPerRecord = 17 - csvreader.TrimLeadingSpace = true - - var totalEgress int64 = -1 - //nolint:errcheck,staticcheck - for rec, err := csvreader.Read(); rec != nil; rec, err = csvreader.Read() { - if err != nil { - return 0, err - } - if len(rec) < 9 { - continue - } - if !strings.HasPrefix(rec[0], dev) { - continue - } - - totalEgress, err = strconv.ParseInt(rec[9], 10, 64) - if err != nil { - return 0, err - } - break - } - if totalEgress < 0 { - return 0, fmt.Errorf("did not find interface") - } - - return totalEgress, nil -} - -func GetEgressTraffic(pid string) (int64, error) { - file, err := os.OpenFile(path.Join("/proc", pid, "/net/dev"), os.O_RDONLY, 0600) +func GetEgressTraffic(pid int) (int64, error) { + pproc, err := procfs.NewProc(pid) if err != nil { - return 0, err + return -1, err } - defer file.Close() - totalEgress, err := readDeviceEgress(file, "eth0") + nd, err := pproc.NetDev() + if err != nil { - return 0, err + return -1, err } - return totalEgress, nil + return int64(nd.Total().TxBytes), nil } From 7ebfcf518bf50f6ef59df28f36ac778e5462cd74 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Mon, 5 Jul 2021 17:04:59 +0000 Subject: [PATCH 4/6] [agent-smith] tests for Smith.checkEgressTrafficCallback --- components/ee/agent-smith/BUILD.yaml | 1 + components/ee/agent-smith/pkg/agent/agent.go | 87 ++++------- .../ee/agent-smith/pkg/agent/agent_test.go | 138 ++++++++++++++++++ .../pkg/{network => agent}/egress.go | 4 +- ...agent_check_egress_excessive_egress.golden | 13 ++ .../agent_check_egress_no_infringement.golden | 13 ++ ..._check_egress_very_excessive_egress.golden | 13 ++ ...egress_very_excessive_egress_simple.golden | 13 ++ .../agent_check_egress_zero_egress.golden | 4 + 9 files changed, 223 insertions(+), 63 deletions(-) rename components/ee/agent-smith/pkg/{network => agent}/egress.go (86%) create mode 100644 components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_excessive_egress.golden create mode 100644 components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_no_infringement.golden create mode 100644 components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_very_excessive_egress.golden create mode 100644 components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_very_excessive_egress_simple.golden create mode 100644 components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_zero_egress.golden diff --git a/components/ee/agent-smith/BUILD.yaml b/components/ee/agent-smith/BUILD.yaml index c15611a5d2c050..8740af0a2e0c06 100644 --- a/components/ee/agent-smith/BUILD.yaml +++ b/components/ee/agent-smith/BUILD.yaml @@ -4,6 +4,7 @@ packages: - name: app type: go srcs: + - "pkg/agent/testdata/**" - "**/*.go" - "go.mod" - "go.sum" diff --git a/components/ee/agent-smith/pkg/agent/agent.go b/components/ee/agent-smith/pkg/agent/agent.go index 20d4e4ee035a59..cfddea81a612de 100644 --- a/components/ee/agent-smith/pkg/agent/agent.go +++ b/components/ee/agent-smith/pkg/agent/agent.go @@ -22,7 +22,6 @@ import ( "github.com/cilium/ebpf/perf" "github.com/gitpod-io/gitpod/agent-smith/pkg/bpf" - "github.com/gitpod-io/gitpod/agent-smith/pkg/network" "github.com/gitpod-io/gitpod/agent-smith/pkg/signature" "github.com/gitpod-io/gitpod/common-go/log" "github.com/gitpod-io/gitpod/common-go/util" @@ -52,6 +51,9 @@ type Smith struct { notifiedInfringements *lru.Cache perfHandler chan perfHandlerFunc pidsMap sync.Map + + egressTrafficCheckHandler func(pid int) (int64, error) + timeElapsedHandler func(t time.Time) time.Duration } // EgressTraffic configures an upper limit of allowed egress traffic over time @@ -138,12 +140,13 @@ func NewAgentSmith(cfg Config) (*Smith, error) { GradeKind(InfringementExcessiveEgress, InfringementSeverityVery): PenaltyStopWorkspace, }, }, - Config: cfg, - GitpodAPI: api, - notifiedInfringements: notificationCache, - perfHandler: make(chan perfHandlerFunc, 10), - metrics: newAgentMetrics(), - pidsMap: sync.Map{}, + Config: cfg, + GitpodAPI: api, + notifiedInfringements: notificationCache, + perfHandler: make(chan perfHandlerFunc, 10), + metrics: newAgentMetrics(), + egressTrafficCheckHandler: getEgressTraffic, + timeElapsedHandler: time.Since, } if cfg.Enforcement.Default != nil { if err := cfg.Enforcement.Default.Validate(); err != nil { @@ -246,7 +249,10 @@ type GradedInfringementKind string // GradeKind produces a graded infringement kind from severity and kind func GradeKind(kind InfringementKind, severity InfringementSeverity) GradedInfringementKind { - return GradedInfringementKind(fmt.Sprintf("%s %s", severity, kind)) + if len(severity) > 0 { + return GradedInfringementKind(fmt.Sprintf("%s %s", severity, kind)) + } + return GradedInfringementKind(kind) } // Severity returns the severity of the graded infringement kind @@ -319,7 +325,7 @@ func (agent *Smith) Start(ctx context.Context, callback func(InfringingWorkspace defer abpf.Close() - agent.cleanupDeadPIDS(ctx) + go agent.cleanupDeadPIDS(ctx) egressTicker := time.NewTicker(30 * time.Second) @@ -402,17 +408,15 @@ func (agent *Smith) Start(ctx context.Context, callback func(InfringingWorkspace func (agent *Smith) cleanupDeadPIDS(ctx context.Context) { ticker := time.NewTicker(30 * time.Second) - go func() { - for { - select { - case <-ticker.C: - agent.cleanupDeadPidsCallback() - case <-ctx.Done(): - ticker.Stop() - return - } + defer ticker.Stop() + for { + select { + case <-ticker.C: + agent.cleanupDeadPidsCallback() + case <-ctx.Done(): + return } - }() + } } func (agent *Smith) cleanupDeadPidsCallback() { @@ -559,6 +563,7 @@ func parseExecveExit(evtHdr EventHeader, buffer []byte) Execve { dataOffsetPtr := unsafe.Sizeof(evtHdr) + unsafe.Sizeof(i)*uintptr(evtHdr.NParams) - 6 // todo(fntlnz): check why this -6 is necessary scratchHeaderOffset := uint32(dataOffsetPtr) + //lint:ignore SA4006 this is used with unsafe.Sizeof retval := int64(buffer[scratchHeaderOffset]) // einfo := bpf.EventTable[bpf.PPME_SYSCALL_EXECVE_19_X] @@ -765,46 +770,6 @@ func getWorkspaceFromProcess(tid int) (res *InfringingWorkspace, err error) { }, nil } -//nolint:deadcode,unused -func mergeInfringingWorkspaces(vws []InfringingWorkspace) (vw InfringingWorkspace) { - for _, r := range vws { - if vw.Pod == "" { - vw.Pod = r.Pod - } - if vw.Owner == "" { - vw.Owner = r.Owner - } - if vw.InstanceID == "" { - vw.InstanceID = r.InstanceID - } - if vw.WorkspaceID == "" { - vw.WorkspaceID = r.WorkspaceID - } - - // Note: the remote URL list is likekly to be very small hence the O^2 complexity is ok - // And just in case the remote URL list isn't small we have a circuit breaker - if len(r.GitRemoteURL) < 100 && len(vw.GitRemoteURL) < 100 { - for _, rr := range r.GitRemoteURL { - var found bool - for _, vr := range vw.GitRemoteURL { - if rr == vr { - found = true - break - } - } - if !found { - vw.GitRemoteURL = append(vw.GitRemoteURL, rr) - } - } - } else if len(vw.GitRemoteURL) == 0 { - vw.GitRemoteURL = r.GitRemoteURL - } - - vw.Infringements = append(vw.Infringements, r.Infringements...) - } - return -} - // RegisterMetrics registers prometheus metrics for this driver func (agent *Smith) RegisterMetrics(reg prometheus.Registerer) error { return agent.metrics.Register(reg) @@ -815,8 +780,8 @@ func (agent *Smith) checkEgressTrafficCallback(pid int, pidCreationTime time.Tim return nil, nil } - podLifetime := time.Since(pidCreationTime) - resp, err := network.GetEgressTraffic(pid) + podLifetime := agent.timeElapsedHandler(pidCreationTime) + resp, err := agent.egressTrafficCheckHandler(pid) if err != nil { return nil, err } diff --git a/components/ee/agent-smith/pkg/agent/agent_test.go b/components/ee/agent-smith/pkg/agent/agent_test.go index abc83ec17772a3..58ea4e73194aff 100644 --- a/components/ee/agent-smith/pkg/agent/agent_test.go +++ b/components/ee/agent-smith/pkg/agent/agent_test.go @@ -5,8 +5,14 @@ package agent import ( + "encoding/json" + "fmt" + "io/ioutil" + "path" + "reflect" "sort" "testing" + "time" "github.com/google/go-cmp/cmp" ) @@ -93,3 +99,135 @@ func BenchmarkFindEnforcementRules(b *testing.B) { findEnforcementRules(rules, "foobar") } } + +func TestCeckEgressTrafficCallback(t *testing.T) { + simpleTime, _ := time.Parse(time.RFC3339, "2021-07-05T15:16:17+02:00") + + type args struct { + pid int + pidCreationTime time.Time + } + tests := map[string]struct { + args args + want *Infringement + egressTrafficCheckHandler func(pid int) (int64, error) + timeElapsedHandler func(t time.Time) time.Duration + wantErr bool + }{ + "no_infringement": { + args: args{ + pid: 1234, + pidCreationTime: simpleTime, + }, + want: nil, + egressTrafficCheckHandler: func(pid int) (int64, error) { + return 2000000, nil + }, + timeElapsedHandler: func(t time.Time) time.Duration { + d, _ := time.ParseDuration("1m") + return d + }, + wantErr: false, + }, + "zero_egress": { + args: args{ + pid: 1234, + pidCreationTime: simpleTime, + }, + want: nil, + egressTrafficCheckHandler: func(pid int) (int64, error) { + return 0, nil + }, + timeElapsedHandler: func(t time.Time) time.Duration { + d, _ := time.ParseDuration("1m") + return d + }, + wantErr: false, + }, + "excessive_egress": { + args: args{ + pid: 1234, + pidCreationTime: simpleTime, + }, + want: &Infringement{ + Kind: GradedInfringementKind(InfringementExcessiveEgress), + Description: "egress traffic is 12.805 megabytes over limit", + }, + egressTrafficCheckHandler: func(pid int) (int64, error) { + return 328000000, nil + }, + timeElapsedHandler: func(t time.Time) time.Duration { + d, _ := time.ParseDuration("1m") + return d + }, + wantErr: false, + }, + "very_excessive_egress_simple": { + args: args{ + pid: 1234, + pidCreationTime: simpleTime, + }, + want: &Infringement{ + Kind: GradedInfringementKind(InfringementVeryExcessiveEgress), + Description: "egress traffic is 188686.863 megabytes over limit", + }, + egressTrafficCheckHandler: func(pid int) (int64, error) { + return 200000000000, nil + }, + timeElapsedHandler: func(t time.Time) time.Duration { + d, _ := time.ParseDuration("1s") + return d + }, + wantErr: false, + }, + "very_excessive_egress": { + args: args{ + pid: 1234, + pidCreationTime: simpleTime, + }, + want: &Infringement{ + Kind: GradedInfringementKind(InfringementVeryExcessiveEgress), + Description: "egress traffic is 188686.863 megabytes over limit", + }, + egressTrafficCheckHandler: func(pid int) (int64, error) { + return 200000000000, nil + }, + timeElapsedHandler: func(t time.Time) time.Duration { + d, _ := time.ParseDuration("1m") + return d + }, + wantErr: false, + }, + } + + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + fc, err := ioutil.ReadFile(path.Join("testdata", fmt.Sprintf("agent_check_egress_%s.golden", name))) + if err != nil { + t.Errorf("cannot read config: %v", err) + return + } + var cfg Config + err = json.Unmarshal(fc, &cfg) + if err != nil { + t.Errorf("cannot unmarshal config: %v", err) + return + } + agent, err := NewAgentSmith(cfg) + if err != nil { + t.Errorf("cannot create test agent smith from config: %v", err) + return + } + agent.egressTrafficCheckHandler = tt.egressTrafficCheckHandler + agent.timeElapsedHandler = tt.timeElapsedHandler + got, err := agent.checkEgressTrafficCallback(tt.args.pid, tt.args.pidCreationTime) + if (err != nil) != tt.wantErr { + t.Errorf("Smith.checkEgressTrafficCallback() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("Smith.checkEgressTrafficCallback() = %s", cmp.Diff(got, tt.want)) + } + }) + } +} diff --git a/components/ee/agent-smith/pkg/network/egress.go b/components/ee/agent-smith/pkg/agent/egress.go similarity index 86% rename from components/ee/agent-smith/pkg/network/egress.go rename to components/ee/agent-smith/pkg/agent/egress.go index 98fd2f0e013990..bd6c07f1546e62 100644 --- a/components/ee/agent-smith/pkg/network/egress.go +++ b/components/ee/agent-smith/pkg/agent/egress.go @@ -2,13 +2,13 @@ // Licensed under the Gitpod Enterprise Source Code License, // See License.enterprise.txt in the project root folder. -package network +package agent import ( "github.com/prometheus/procfs" ) -func GetEgressTraffic(pid int) (int64, error) { +func getEgressTraffic(pid int) (int64, error) { pproc, err := procfs.NewProc(pid) if err != nil { return -1, err diff --git a/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_excessive_egress.golden b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_excessive_egress.golden new file mode 100644 index 00000000000000..d1c4e0b77e6004 --- /dev/null +++ b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_excessive_egress.golden @@ -0,0 +1,13 @@ +{ + "egressTraffic": { + "dt": "2m", + "excessive": { + "baseBudget": "300Mi", + "perDtThreshold": "100Mi" + }, + "veryExcessive": { + "baseBudget": "2Gi", + "perDtThreshold": "250Mi" + } + } +} diff --git a/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_no_infringement.golden b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_no_infringement.golden new file mode 100644 index 00000000000000..d1c4e0b77e6004 --- /dev/null +++ b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_no_infringement.golden @@ -0,0 +1,13 @@ +{ + "egressTraffic": { + "dt": "2m", + "excessive": { + "baseBudget": "300Mi", + "perDtThreshold": "100Mi" + }, + "veryExcessive": { + "baseBudget": "2Gi", + "perDtThreshold": "250Mi" + } + } +} diff --git a/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_very_excessive_egress.golden b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_very_excessive_egress.golden new file mode 100644 index 00000000000000..b4d32612430169 --- /dev/null +++ b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_very_excessive_egress.golden @@ -0,0 +1,13 @@ +{ + "egressTraffic": { + "dt": "1h", + "excessive": { + "baseBudget": "300Mi", + "perDtThreshold": "100Mi" + }, + "veryExcessive": { + "baseBudget": "2Gi", + "perDtThreshold": "250Mi" + } + } +} diff --git a/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_very_excessive_egress_simple.golden b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_very_excessive_egress_simple.golden new file mode 100644 index 00000000000000..d1c4e0b77e6004 --- /dev/null +++ b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_very_excessive_egress_simple.golden @@ -0,0 +1,13 @@ +{ + "egressTraffic": { + "dt": "2m", + "excessive": { + "baseBudget": "300Mi", + "perDtThreshold": "100Mi" + }, + "veryExcessive": { + "baseBudget": "2Gi", + "perDtThreshold": "250Mi" + } + } +} diff --git a/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_zero_egress.golden b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_zero_egress.golden new file mode 100644 index 00000000000000..617daefadbcac8 --- /dev/null +++ b/components/ee/agent-smith/pkg/agent/testdata/agent_check_egress_zero_egress.golden @@ -0,0 +1,4 @@ +{ + "egressTraffic": { + } +} From da335badea03ab7334c5f020327807f36ec55459 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Tue, 6 Jul 2021 13:10:34 +0000 Subject: [PATCH 5/6] [agent-smith] counter for monitored processes --- chart/templates/agent-smith-configmap.yaml | 2 +- components/ee/agent-smith/pkg/agent/agent.go | 19 +++-- .../ee/agent-smith/pkg/agent/metrics.go | 76 +++++++++++++++---- 3 files changed, 76 insertions(+), 21 deletions(-) diff --git a/chart/templates/agent-smith-configmap.yaml b/chart/templates/agent-smith-configmap.yaml index 0344438dbf84f8..05ad8e5908e9e0 100644 --- a/chart/templates/agent-smith-configmap.yaml +++ b/chart/templates/agent-smith-configmap.yaml @@ -34,7 +34,7 @@ data: }, "pprofAddr": "localhost:6060", "prometheusAddr": "localhost:9500", - "hostURL": "https://{{ $.Values.hostname }}" + "hostURL": "https://{{ $.Values.hostname }}", "egressTraffic": { "dt": "2m", "excessive": { diff --git a/components/ee/agent-smith/pkg/agent/agent.go b/components/ee/agent-smith/pkg/agent/agent.go index cfddea81a612de..b8da086a1f2584 100644 --- a/components/ee/agent-smith/pkg/agent/agent.go +++ b/components/ee/agent-smith/pkg/agent/agent.go @@ -15,7 +15,6 @@ import ( "sort" "strconv" "strings" - "sync" "syscall" "time" "unsafe" @@ -50,7 +49,7 @@ type Smith struct { notifiedInfringements *lru.Cache perfHandler chan perfHandlerFunc - pidsMap sync.Map + pidsMap syncMapCounter egressTrafficCheckHandler func(pid int) (int64, error) timeElapsedHandler func(t time.Time) time.Duration @@ -128,6 +127,10 @@ func NewAgentSmith(cfg Config) (*Smith, error) { } } + m := newAgentMetrics() + pidsMap := syncMapCounter{} + pidsMap.WithCounter(m.currentlyMonitoredPIDS) + res := &Smith{ EnforcementRules: map[string]EnforcementRules{ defaultRuleset: { @@ -144,9 +147,10 @@ func NewAgentSmith(cfg Config) (*Smith, error) { GitpodAPI: api, notifiedInfringements: notificationCache, perfHandler: make(chan perfHandlerFunc, 10), - metrics: newAgentMetrics(), + metrics: m, egressTrafficCheckHandler: getEgressTraffic, timeElapsedHandler: time.Since, + pidsMap: pidsMap, } if cfg.Enforcement.Default != nil { if err := cfg.Enforcement.Default.Validate(); err != nil { @@ -249,10 +253,10 @@ type GradedInfringementKind string // GradeKind produces a graded infringement kind from severity and kind func GradeKind(kind InfringementKind, severity InfringementSeverity) GradedInfringementKind { - if len(severity) > 0 { - return GradedInfringementKind(fmt.Sprintf("%s %s", severity, kind)) + if len(severity) == 0 { + return GradedInfringementKind(kind) } - return GradedInfringementKind(kind) + return GradedInfringementKind(fmt.Sprintf("%s %s", severity, kind)) } // Severity returns the severity of the graded infringement kind @@ -419,6 +423,8 @@ func (agent *Smith) cleanupDeadPIDS(ctx context.Context) { } } +// cleanupDeadPidsCallback removes from pidsMap all the process IDs +// that are not active anymore or don't have a workspace associated func (agent *Smith) cleanupDeadPidsCallback() { agent.pidsMap.Range(func(key, value interface{}) bool { p := key.(int) @@ -443,7 +449,6 @@ func (agent *Smith) cleanupDeadPidsCallback() { return true }) - } // Penalize acts on infringements and e.g. stops pods diff --git a/components/ee/agent-smith/pkg/agent/metrics.go b/components/ee/agent-smith/pkg/agent/metrics.go index e63a8e87910362..d96fb9033e8889 100644 --- a/components/ee/agent-smith/pkg/agent/metrics.go +++ b/components/ee/agent-smith/pkg/agent/metrics.go @@ -4,13 +4,23 @@ package agent -import "github.com/prometheus/client_golang/prometheus" +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +const ( + counterMapStatusStored = "stored" + counterMapStatusDeleted = "deleted" +) type metrics struct { penaltyAttempts *prometheus.CounterVec penaltyFailures *prometheus.CounterVec signatureCheckMiss prometheus.Counter signatureCheckFailures prometheus.Counter + currentlyMonitoredPIDS *prometheus.CounterVec } func newAgentMetrics() *metrics { @@ -32,18 +42,30 @@ func newAgentMetrics() *metrics { Help: "The total amount of failed attempts that agent-smith is trying to apply a penalty.", }, []string{"penalty", "reason"}, ) - m.signatureCheckMiss = prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "gitpod", - Subsystem: "agent_smith", - Name: "signature_check_missed_total", - Help: "The total amount of times where the processes ended before we could open the executable.", - }) - m.signatureCheckFailures = prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "gitpod", - Subsystem: "agent_smith", - Name: "signature_check_failed_total", - Help: "The total amount of failed signature check attempts", - }) + m.signatureCheckMiss = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: "gitpod", + Subsystem: "agent_smith", + Name: "signature_check_missed_total", + Help: "The total amount of times where the processes ended before we could open the executable.", + }, + ) + m.signatureCheckFailures = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: "gitpod", + Subsystem: "agent_smith", + Name: "signature_check_failed_total", + Help: "The total amount of failed signature check attempts", + }, + ) + m.currentlyMonitoredPIDS = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "gitpod", + Subsystem: "agent_smith", + Name: "monitored_pids", + Help: "Current count of pids under investigation", + }, []string{"process_state"}, + ) return m } @@ -57,6 +79,7 @@ func (m *metrics) Register(reg prometheus.Registerer) error { m.penaltyFailures, m.signatureCheckMiss, m.signatureCheckFailures, + m.currentlyMonitoredPIDS, } for _, c := range collectors { err := reg.Register(c) @@ -67,3 +90,30 @@ func (m *metrics) Register(reg prometheus.Registerer) error { return nil } + +type syncMapCounter struct { + sync.Map + counter *prometheus.CounterVec +} + +func (m *syncMapCounter) WithCounter(c *prometheus.CounterVec) { + m.counter = c +} + +func (m *syncMapCounter) Store(key, value interface{}) { + m.Map.Store(key, value) + if m.counter != nil { + m.counter.WithLabelValues(counterMapStatusStored).Inc() + } +} + +func (m *syncMapCounter) Delete(key interface{}) { + m.Map.Delete(key) + if m.counter != nil { + m.counter.WithLabelValues(counterMapStatusDeleted).Inc() + } +} + +func (m *syncMapCounter) Range(f func(key, value interface{}) bool) { + m.Map.Range(f) +} From 9a45f53a415d5963338f794fa7fa811cc93e91ee Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Wed, 7 Jul 2021 10:19:55 +0000 Subject: [PATCH 6/6] [agent-smith] egress pids map store only if process is supervisor --- components/ee/agent-smith/pkg/agent/agent.go | 33 ++++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/components/ee/agent-smith/pkg/agent/agent.go b/components/ee/agent-smith/pkg/agent/agent.go index b8da086a1f2584..f95b5587822770 100644 --- a/components/ee/agent-smith/pkg/agent/agent.go +++ b/components/ee/agent-smith/pkg/agent/agent.go @@ -11,6 +11,7 @@ import ( "fmt" "net/url" "os" + "path" "path/filepath" "sort" "strconv" @@ -619,11 +620,37 @@ func (agent *Smith) processPerfRecord(rec perf.Record) { } } +// isSupervisor checks if the execve syscall +// is relative to a supervisor process +// This check must be very fast to avoid blocking the +// reading of the perf buffer so no library have been used +// like prometheus/procfs which reads the whole process tree before +// allowing to read the executable path +// What does it do +// - check if the binary name is supervisor +// - check if it is the actual supervisor we ship in the workspace +func isSupervisor(execve Execve) bool { + if execve.Filename == "supervisor" { + exePath := path.Join("/proc", strconv.Itoa(execve.TID), "exe") + + // error checking is skipped because the readlink syscall will not find + // the destiantion path since its in another mount namespace. + absPath, _ := os.Readlink(exePath) + if absPath == "/.supervisor/supervisor" { + return true + } + } + return false +} + // handles an execve event checks if it's infringing func (agent *Smith) handleExecveEvent(execve Execve) func() (*InfringingWorkspace, error) { - // this is not the exact process startup time - // but for the type of comparison we need to do is enough - agent.pidsMap.Store(execve.TID, time.Now()) + + if isSupervisor(execve) { + // this is not the exact process startup time + // but for the type of comparison we need to do is enough + agent.pidsMap.Store(execve.TID, time.Now()) + } return func() (*InfringingWorkspace, error) { if agent.Config.Blacklists == nil {