diff --git a/internal/server/metrics.go b/internal/server/metrics.go index b5a694f..2c32f44 100644 --- a/internal/server/metrics.go +++ b/internal/server/metrics.go @@ -8,21 +8,21 @@ var ( Help: "Time that a workflow job took to reach a given state.", Buckets: prometheus.ExponentialBuckets(1, 1.4, 30), }, - []string{"org", "repo", "state", "runner_group"}, + []string{"org", "repo", "state", "runner_group", "workflow_name", "job_name"}, ) workflowJobDurationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "workflow_job_duration_seconds_total", Help: "The total duration of jobs.", }, - []string{"org", "repo", "status", "conclusion", "runner_group"}, + []string{"org", "repo", "status", "conclusion", "runner_group", "workflow_name", "job_name"}, ) workflowJobStatusCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "workflow_job_status_count", Help: "Count of workflow job events.", }, - []string{"org", "repo", "status", "conclusion", "runner_group"}, + []string{"org", "repo", "status", "conclusion", "runner_group", "workflow_name", "job_name"}, ) workflowRunHistogramVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ @@ -107,9 +107,10 @@ func init() { } type WorkflowObserver interface { - ObserveWorkflowJobDuration(org, repo, state, runnerGroup string, seconds float64) - CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string) - CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64) + ObserveWorkflowJobDuration(org, repo, state, runnerGroup, workflowName, jobName string, seconds float64) + CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName string) + CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName string, seconds float64) + ObserveWorkflowRunDuration(org, repo, workflow, conclusion string, seconds float64) CountWorkflowRunStatus(org, repo, status, conclusion, workflow string) } @@ -118,17 +119,17 @@ var _ WorkflowObserver = (*PrometheusObserver)(nil) type PrometheusObserver struct{} -func (o *PrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runnerGroup string, seconds float64) { - workflowJobHistogramVec.WithLabelValues(org, repo, state, runnerGroup). +func (o *PrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runnerGroup, workflowName, jobName string, seconds float64) { + workflowJobHistogramVec.WithLabelValues(org, repo, state, runnerGroup, workflowName, jobName). Observe(seconds) } -func (o *PrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string) { - workflowJobStatusCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup).Inc() +func (o *PrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName string) { + workflowJobStatusCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup, workflowName, jobName).Inc() } -func (o *PrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64) { - workflowJobDurationCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup).Add(seconds) +func (o *PrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName string, seconds float64) { + workflowJobDurationCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup, workflowName, jobName).Add(seconds) } func (o *PrometheusObserver) ObserveWorkflowRunDuration(org, repo, workflowName, conclusion string, seconds float64) { diff --git a/internal/server/server_test.go b/internal/server/server_test.go index e8c6a96..6b11d90 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -92,6 +92,8 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) { jobStartedAt := time.Unix(1650308740, 0) completedAt := jobStartedAt.Add(time.Duration(expectedDuration) * time.Second) runnerGroupName := "runner-group" + workflowName := "Build and test" + jobName := "Test" event := github.WorkflowJobEvent{ Action: github.String("completed"), @@ -107,6 +109,8 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) { StartedAt: &github.Timestamp{Time: jobStartedAt}, CompletedAt: &github.Timestamp{Time: completedAt}, RunnerGroupName: &runnerGroupName, + WorkflowName: &workflowName, + Name: &jobName, }, } req := testWebhookRequest(t, "http://localhost:8001/webhook", "workflow_job", event) @@ -125,6 +129,6 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) { payload, err := io.ReadAll(metricsRes.Body) require.NoError(t, err) - assert.Contains(t, string(payload), `workflow_job_duration_seconds_bucket{org="someone",repo="some-repo",runner_group="runner-group",state="in_progress",le="10.541350399999995"} 1`) - assert.Contains(t, string(payload), `workflow_job_duration_seconds_total{conclusion="success",org="someone",repo="some-repo",runner_group="runner-group",status="completed"} 10`) + assert.Contains(t, string(payload), `workflow_job_duration_seconds_bucket{job_name="Test",org="someone",repo="some-repo",runner_group="runner-group",state="in_progress",workflow_name="Build and test",le="10.541350399999995"} 1`) + assert.Contains(t, string(payload), `workflow_job_duration_seconds_total{conclusion="success",job_name="Test",org="someone",repo="some-repo",runner_group="runner-group",status="completed",workflow_name="Build and test"} 10`) } diff --git a/internal/server/workflow_metrics_exporter.go b/internal/server/workflow_metrics_exporter.go index da75160..8d25d8e 100644 --- a/internal/server/workflow_metrics_exporter.go +++ b/internal/server/workflow_metrics_exporter.go @@ -74,7 +74,13 @@ func (c *WorkflowMetricsExporter) HandleGHWebHook(w http.ResponseWriter, r *http return case "workflow_job": event := model.WorkflowJobEventFromJSON(io.NopCloser(bytes.NewBuffer(buf))) - _ = level.Info(c.Logger).Log("msg", "got workflow_job event", "org", event.GetRepo().GetOwner().GetLogin(), "repo", event.GetRepo().GetName(), "runId", event.GetWorkflowJob().GetRunID(), "action", event.GetAction()) + _ = level.Info(c.Logger).Log("msg", "got workflow_job event", + "org", event.GetRepo().GetOwner().GetLogin(), + "repo", event.GetRepo().GetName(), + "runId", event.GetWorkflowJob().GetRunID(), + "action", event.GetAction(), + "workflow_name", event.GetWorkflowJob().GetWorkflowName(), + "job_name", event.GetWorkflowJob().GetName()) go c.CollectWorkflowJobEvent(event) case "workflow_run": event := model.WorkflowRunEventFromJSON(io.NopCloser(bytes.NewBuffer(buf))) @@ -93,37 +99,40 @@ func (c *WorkflowMetricsExporter) HandleGHWebHook(w http.ResponseWriter, r *http func (c *WorkflowMetricsExporter) CollectWorkflowJobEvent(event *github.WorkflowJobEvent) { repo := event.GetRepo().GetName() org := event.GetRepo().GetOwner().GetLogin() - runnerGroup := event.WorkflowJob.GetRunnerGroupName() - action := event.GetAction() - conclusion := event.GetWorkflowJob().GetConclusion() - status := event.GetWorkflowJob().GetStatus() + + workflowJob := event.GetWorkflowJob() + runnerGroup := workflowJob.GetRunnerGroupName() + conclusion := workflowJob.GetConclusion() + status := workflowJob.GetStatus() + workflowName := workflowJob.GetWorkflowName() + jobName := workflowJob.GetName() switch action { case "queued": // Do nothing. case "in_progress": - if len(event.WorkflowJob.Steps) == 0 { + if len(workflowJob.Steps) == 0 { _ = level.Debug(c.Logger).Log("msg", "unable to calculate job duration of in_progress event as event has no steps") break } - firstStep := event.WorkflowJob.Steps[0] - queuedSeconds := firstStep.StartedAt.Time.Sub(event.WorkflowJob.StartedAt.Time).Seconds() - c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "queued", runnerGroup, math.Max(0, queuedSeconds)) + firstStep := workflowJob.Steps[0] + queuedSeconds := firstStep.StartedAt.Time.Sub(workflowJob.GetStartedAt().Time).Seconds() + c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "queued", runnerGroup, workflowName, jobName, math.Max(0, queuedSeconds)) case "completed": - if event.WorkflowJob.StartedAt == nil || event.WorkflowJob.CompletedAt == nil { + if workflowJob.StartedAt == nil || workflowJob.CompletedAt == nil { _ = level.Debug(c.Logger).Log("msg", "unable to calculate job duration of completed event steps are missing timestamps") break } - jobSeconds := math.Max(0, event.WorkflowJob.GetCompletedAt().Time.Sub(event.WorkflowJob.GetStartedAt().Time).Seconds()) - c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "in_progress", runnerGroup, jobSeconds) - c.PrometheusObserver.CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, jobSeconds) + jobSeconds := math.Max(0, workflowJob.GetCompletedAt().Time.Sub(workflowJob.GetStartedAt().Time).Seconds()) + c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "in_progress", runnerGroup, workflowName, jobName, jobSeconds) + c.PrometheusObserver.CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName, jobSeconds) } - c.PrometheusObserver.CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup) + c.PrometheusObserver.CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName) } func (c *WorkflowMetricsExporter) CollectWorkflowRunEvent(event *github.WorkflowRunEvent) { diff --git a/internal/server/workflow_metrics_exporter_test.go b/internal/server/workflow_metrics_exporter_test.go index b7dfc32..9eec539 100644 --- a/internal/server/workflow_metrics_exporter_test.go +++ b/internal/server/workflow_metrics_exporter_test.go @@ -126,6 +126,9 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobQueuedEvent(t *testing.T) action := "completed" status := "completed" conclusion := "success" + workflowName := "Build and test" + jobName := "Test" + event := github.WorkflowJobEvent{ Action: &action, Repo: &github.Repository{ @@ -138,6 +141,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobQueuedEvent(t *testing.T) Status: &status, Conclusion: &conclusion, RunnerGroupName: &runnerGroupName, + WorkflowName: &workflowName, + Name: &jobName, }, } req := testWebhookRequest(t, "/anything", "workflow_job", event) @@ -150,11 +155,13 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobQueuedEvent(t *testing.T) assert.Equal(t, http.StatusAccepted, res.Result().StatusCode) observer.assertNoWorkflowJobDurationObservation(1 * time.Second) observer.assertWorkflowJobStatusCount(workflowJobStatusCount{ - org: org, - repo: repo, - status: action, - conclusion: conclusion, - runnerGroup: runnerGroupName, + org: org, + repo: repo, + status: action, + conclusion: conclusion, + runnerGroup: runnerGroupName, + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) } @@ -177,6 +184,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobInProgressEvent(t *testing runnerGroupName := "runner-group" action := "in_progress" status := "in_progress" + workflowName := "Build and test" + jobName := "Test" event := github.WorkflowJobEvent{ Action: &action, @@ -198,6 +207,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobInProgressEvent(t *testing }, }, RunnerGroupName: &runnerGroupName, + WorkflowName: &workflowName, + Name: &jobName, }, } req := testWebhookRequest(t, "/anything", "workflow_job", event) @@ -209,18 +220,22 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobInProgressEvent(t *testing // Then assert.Equal(t, http.StatusAccepted, res.Result().StatusCode) observer.assertWorkflowJobObservation(workflowJobObservation{ - org: org, - repo: repo, - state: "queued", - runnerGroup: runnerGroupName, - seconds: expectedDuration, + org: org, + repo: repo, + state: "queued", + runnerGroup: runnerGroupName, + seconds: expectedDuration, + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) observer.assertWorkflowJobStatusCount(workflowJobStatusCount{ - org: org, - repo: repo, - runnerGroup: runnerGroupName, - status: action, - conclusion: "", + org: org, + repo: repo, + runnerGroup: runnerGroupName, + status: action, + conclusion: "", + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) } @@ -243,6 +258,8 @@ func Test_WorkflowMetricsExporter_HandleGHWebHook_WorkflowJobInProgressEventWith runnerGroupName := "runner-group" action := "in_progress" status := "in_progress" + workflowName := "Build and test" + jobName := "Test" event := github.WorkflowJobEvent{ Action: &action, @@ -264,6 +281,8 @@ func Test_WorkflowMetricsExporter_HandleGHWebHook_WorkflowJobInProgressEventWith }, }, RunnerGroupName: &runnerGroupName, + WorkflowName: &workflowName, + Name: &jobName, }, } req := testWebhookRequest(t, "/anything", "workflow_job", event) @@ -275,18 +294,22 @@ func Test_WorkflowMetricsExporter_HandleGHWebHook_WorkflowJobInProgressEventWith // Then assert.Equal(t, http.StatusAccepted, res.Result().StatusCode) observer.assertWorkflowJobObservation(workflowJobObservation{ - org: org, - repo: repo, - state: "queued", - runnerGroup: runnerGroupName, - seconds: 0, + org: org, + repo: repo, + state: "queued", + runnerGroup: runnerGroupName, + workflowName: workflowName, + jobName: jobName, + seconds: 0, }, 50*time.Millisecond) observer.assertWorkflowJobStatusCount(workflowJobStatusCount{ - org: org, - repo: repo, - runnerGroup: runnerGroupName, - status: action, - conclusion: "", + org: org, + repo: repo, + runnerGroup: runnerGroupName, + status: action, + conclusion: "", + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) } @@ -310,6 +333,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent(t *testing. action := "completed" status := "completed" conclusion := "success" + workflowName := "Build and test" + jobName := "Test" event := github.WorkflowJobEvent{ Action: &action, @@ -325,6 +350,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent(t *testing. Status: &status, Conclusion: &conclusion, RunnerGroupName: &runnerGroupName, + WorkflowName: &workflowName, + Name: &jobName, }, } req := testWebhookRequest(t, "/anything", "workflow_job", event) @@ -336,26 +363,32 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent(t *testing. // Then assert.Equal(t, http.StatusAccepted, res.Result().StatusCode) observer.assertWorkflowJobObservation(workflowJobObservation{ - org: org, - repo: repo, - state: "in_progress", - runnerGroup: runnerGroupName, - seconds: expectedDuration, + org: org, + repo: repo, + state: "in_progress", + runnerGroup: runnerGroupName, + seconds: expectedDuration, + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) observer.assertWorkflowJobStatusCount(workflowJobStatusCount{ - org: org, - repo: repo, - runnerGroup: runnerGroupName, - status: status, - conclusion: conclusion, + org: org, + repo: repo, + runnerGroup: runnerGroupName, + status: status, + conclusion: conclusion, + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) observer.assertWorkflowJobDurationCount(workflowJobDurationCount{ - org: org, - repo: repo, - runnerGroup: runnerGroupName, - status: status, - conclusion: conclusion, - seconds: expectedDuration, + org: org, + repo: repo, + runnerGroup: runnerGroupName, + status: status, + conclusion: conclusion, + seconds: expectedDuration, + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) } @@ -377,6 +410,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent_WithNoStart action := "completed" status := "completed" conclusion := "success" + workflowName := "Build and test" + jobName := "Test" event := github.WorkflowJobEvent{ Action: &action, @@ -391,6 +426,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent_WithNoStart Conclusion: &conclusion, Status: &status, RunnerGroupName: &runnerGroupName, + WorkflowName: &workflowName, + Name: &jobName, }, } req := testWebhookRequest(t, "/anything", "workflow_job", event) @@ -402,11 +439,13 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent_WithNoStart // Then assert.Equal(t, http.StatusAccepted, res.Result().StatusCode) observer.assertWorkflowJobStatusCount(workflowJobStatusCount{ - org: org, - repo: repo, - runnerGroup: runnerGroupName, - status: status, - conclusion: conclusion, + org: org, + repo: repo, + runnerGroup: runnerGroupName, + status: status, + conclusion: conclusion, + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) } @@ -427,6 +466,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent_WithNoCompl action := "completed" status := "completed" conclusion := "success" + workflowName := "Build and test" + jobName := "Test" event := github.WorkflowJobEvent{ Action: &action, @@ -441,6 +482,8 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent_WithNoCompl Conclusion: &conclusion, Status: &status, RunnerGroupName: &runnerGroupName, + WorkflowName: &workflowName, + Name: &jobName, }, } req := testWebhookRequest(t, "/anything", "workflow_job", event) @@ -452,11 +495,13 @@ func Test_GHActionExporter_HandleGHWebHook_WorkflowJobCompletedEvent_WithNoCompl // Then assert.Equal(t, http.StatusAccepted, res.Result().StatusCode) observer.assertWorkflowJobStatusCount(workflowJobStatusCount{ - org: org, - repo: repo, - runnerGroup: runnerGroupName, - status: status, - conclusion: conclusion, + org: org, + repo: repo, + runnerGroup: runnerGroupName, + status: status, + conclusion: conclusion, + workflowName: workflowName, + jobName: jobName, }, 50*time.Millisecond) } @@ -589,16 +634,16 @@ func addValidSignatureHeader(t *testing.T, req *http.Request, payload []byte) { } type workflowJobObservation struct { - org, repo, state, runnerGroup string - seconds float64 + org, repo, state, runnerGroup, workflowName, jobName string + seconds float64 } type workflowJobStatusCount struct { - org, repo, status, conclusion, runnerGroup string + org, repo, status, conclusion, runnerGroup, workflowName, jobName string } type workflowJobDurationCount struct { - org, repo, status, conclusion, runnerGroup string - seconds float64 + org, repo, status, conclusion, runnerGroup, workflowName, jobName string + seconds float64 } type workflowRunObservation struct { @@ -632,34 +677,40 @@ func NewTestPrometheusObserver(t *testing.T) *TestPrometheusObserver { } } -func (o *TestPrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runnerGroup string, seconds float64) { +func (o *TestPrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runnerGroup, workflowName, jobName string, seconds float64) { o.workFlowJobDurationObserved <- workflowJobObservation{ - org: org, - repo: repo, - state: state, - runnerGroup: runnerGroup, - seconds: seconds, + org: org, + repo: repo, + state: state, + runnerGroup: runnerGroup, + workflowName: workflowName, + jobName: jobName, + seconds: seconds, } } -func (o *TestPrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string) { +func (o *TestPrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName string) { o.workflowJobStatusCounted <- workflowJobStatusCount{ - org: org, - repo: repo, - status: status, - conclusion: conclusion, - runnerGroup: runnerGroup, + org: org, + repo: repo, + status: status, + conclusion: conclusion, + runnerGroup: runnerGroup, + workflowName: workflowName, + jobName: jobName, } } -func (o *TestPrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64) { +func (o *TestPrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName string, seconds float64) { o.workflowJobDurationCounted <- workflowJobDurationCount{ - org: org, - repo: repo, - status: status, - conclusion: conclusion, - runnerGroup: runnerGroup, - seconds: seconds, + org: org, + repo: repo, + status: status, + conclusion: conclusion, + runnerGroup: runnerGroup, + workflowName: workflowName, + jobName: jobName, + seconds: seconds, } }