Skip to content

Commit

Permalink
Extend workflow job run metric (#167)
Browse files Browse the repository at this point in the history
Signed-off-by: peterhalasz <peter.halasz@live.com>
  • Loading branch information
peterhalasz committed Feb 6, 2024
1 parent 6129f95 commit 08d40ff
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 103 deletions.
25 changes: 13 additions & 12 deletions internal/server/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ var (
Help: "Time that a workflow job took to reach a given state.",
Buckets: prometheus.ExponentialBuckets(1, 1.4, 30),
},
[]string{"org", "repo", "state", "runner_group"},
[]string{"org", "repo", "state", "runner_group", "workflow_name", "job_name"},
)

workflowJobDurationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "workflow_job_duration_seconds_total",
Help: "The total duration of jobs.",
},
[]string{"org", "repo", "status", "conclusion", "runner_group"},
[]string{"org", "repo", "status", "conclusion", "runner_group", "workflow_name", "job_name"},
)

workflowJobStatusCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "workflow_job_status_count",
Help: "Count of workflow job events.",
},
[]string{"org", "repo", "status", "conclusion", "runner_group"},
[]string{"org", "repo", "status", "conclusion", "runner_group", "workflow_name", "job_name"},
)

workflowRunHistogramVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Expand Down Expand Up @@ -83,9 +83,10 @@ func init() {
}

// WorkflowObserver is the set of recording operations for workflow job and
// workflow run metrics: observing durations and counting statuses.
//
// This listing is a rendered diff, so the three job-level methods appear
// twice. The first group is the pre-change form; the second group adds the
// workflowName and jobName parameters and is the form the implementations
// and call sites in this commit use.
type WorkflowObserver interface {
// Pre-change job-level signatures (without workflowName / jobName).
ObserveWorkflowJobDuration(org, repo, state, runnerGroup string, seconds float64)
CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string)
CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64)
// Post-change job-level signatures: workflowName and jobName are passed
// after runnerGroup; seconds stays last where present.
ObserveWorkflowJobDuration(org, repo, state, runnerGroup, workflowName, jobName string, seconds float64)
CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName string)
CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName string, seconds float64)

// Run-level methods; unchanged by this commit.
ObserveWorkflowRunDuration(org, repo, workflow, conclusion string, seconds float64)
CountWorkflowRunStatus(org, repo, status, conclusion, workflow string)
}
Expand All @@ -94,17 +95,17 @@ var _ WorkflowObserver = (*PrometheusObserver)(nil)

// PrometheusObserver is a field-less type whose methods record workflow
// metrics into the package-level Prometheus metric vectors.
type PrometheusObserver struct{}

// ObserveWorkflowJobDuration records seconds as one observation on
// workflowJobHistogramVec under the given label values.
//
// Rendered diff: the first signature/call pair is the pre-change form; the
// second pair additionally passes workflowName and jobName as label values.
// Label values are positional, so their order must match the label names the
// vector was declared with.
func (o *PrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runnerGroup string, seconds float64) {
workflowJobHistogramVec.WithLabelValues(org, repo, state, runnerGroup).
func (o *PrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runnerGroup, workflowName, jobName string, seconds float64) {
workflowJobHistogramVec.WithLabelValues(org, repo, state, runnerGroup, workflowName, jobName).
Observe(seconds)
}

// CountWorkflowJobStatus increments workflowJobStatusCounter by one for the
// given label values.
//
// Rendered diff: the first signature/call pair is the pre-change form; the
// second pair additionally passes workflowName and jobName, matching the
// extended label list
// {"org", "repo", "status", "conclusion", "runner_group", "workflow_name", "job_name"}.
func (o *PrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string) {
workflowJobStatusCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup).Inc()
func (o *PrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName string) {
workflowJobStatusCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup, workflowName, jobName).Inc()
}

// CountWorkflowJobDuration adds seconds to workflowJobDurationCounter for the
// given label values, accumulating total job duration.
//
// Rendered diff: the first signature/call pair is the pre-change form; the
// second pair additionally passes workflowName and jobName, matching the
// extended label list
// {"org", "repo", "status", "conclusion", "runner_group", "workflow_name", "job_name"}.
func (o *PrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64) {
workflowJobDurationCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup).Add(seconds)
func (o *PrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName string, seconds float64) {
workflowJobDurationCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup, workflowName, jobName).Add(seconds)
}

func (o *PrometheusObserver) ObserveWorkflowRunDuration(org, repo, workflowName, conclusion string, seconds float64) {
Expand Down
8 changes: 6 additions & 2 deletions internal/server/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) {
jobStartedAt := time.Unix(1650308740, 0)
completedAt := jobStartedAt.Add(time.Duration(expectedDuration) * time.Second)
runnerGroupName := "runner-group"
workflowName := "Build and test"
jobName := "Test"

event := github.WorkflowJobEvent{
Action: github.String("completed"),
Expand All @@ -107,6 +109,8 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) {
StartedAt: &github.Timestamp{Time: jobStartedAt},
CompletedAt: &github.Timestamp{Time: completedAt},
RunnerGroupName: &runnerGroupName,
WorkflowName: &workflowName,
Name: &jobName,
},
}
req := testWebhookRequest(t, "http://localhost:8001/webhook", "workflow_job", event)
Expand All @@ -125,6 +129,6 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) {

payload, err := io.ReadAll(metricsRes.Body)
require.NoError(t, err)
assert.Contains(t, string(payload), `workflow_job_duration_seconds_bucket{org="someone",repo="some-repo",runner_group="runner-group",state="in_progress",le="10.541350399999995"} 1`)
assert.Contains(t, string(payload), `workflow_job_duration_seconds_total{conclusion="success",org="someone",repo="some-repo",runner_group="runner-group",status="completed"} 10`)
assert.Contains(t, string(payload), `workflow_job_duration_seconds_bucket{job_name="Test",org="someone",repo="some-repo",runner_group="runner-group",state="in_progress",workflow_name="Build and test",le="10.541350399999995"} 1`)
assert.Contains(t, string(payload), `workflow_job_duration_seconds_total{conclusion="success",job_name="Test",org="someone",repo="some-repo",runner_group="runner-group",status="completed",workflow_name="Build and test"} 10`)
}
37 changes: 23 additions & 14 deletions internal/server/workflow_metrics_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,13 @@ func (c *WorkflowMetricsExporter) HandleGHWebHook(w http.ResponseWriter, r *http
return
case "workflow_job":
event := model.WorkflowJobEventFromJSON(io.NopCloser(bytes.NewBuffer(buf)))
_ = level.Info(c.Logger).Log("msg", "got workflow_job event", "org", event.GetRepo().GetOwner().GetLogin(), "repo", event.GetRepo().GetName(), "runId", event.GetWorkflowJob().GetRunID(), "action", event.GetAction())
_ = level.Info(c.Logger).Log("msg", "got workflow_job event",
"org", event.GetRepo().GetOwner().GetLogin(),
"repo", event.GetRepo().GetName(),
"runId", event.GetWorkflowJob().GetRunID(),
"action", event.GetAction(),
"workflow_name", event.GetWorkflowJob().GetWorkflowName(),
"job_name", event.GetWorkflowJob().GetName())
go c.CollectWorkflowJobEvent(event)
case "workflow_run":
event := model.WorkflowRunEventFromJSON(io.NopCloser(bytes.NewBuffer(buf)))
Expand All @@ -93,37 +99,40 @@ func (c *WorkflowMetricsExporter) HandleGHWebHook(w http.ResponseWriter, r *http
// CollectWorkflowJobEvent turns one workflow_job webhook event into metric
// updates on c.PrometheusObserver.
//
// Behavior by event action:
//   - "queued": records no duration.
//   - "in_progress": observes, under state "queued", the time between the
//     job's start and its first step's start; skipped when the job has no
//     steps.
//   - "completed": observes, under state "in_progress", the time between the
//     job's start and completion, and adds the same value to the duration
//     counter; skipped when either timestamp is nil.
//
// The status counter is incremented for every event, including skipped ones,
// because the breaks below only leave the switch.
//
// Rendered diff: lines reading event.WorkflowJob.* directly are the
// pre-change form; lines using the workflowJob local, workflowName and
// jobName are the post-change form.
func (c *WorkflowMetricsExporter) CollectWorkflowJobEvent(event *github.WorkflowJobEvent) {
repo := event.GetRepo().GetName()
org := event.GetRepo().GetOwner().GetLogin()
runnerGroup := event.WorkflowJob.GetRunnerGroupName()

action := event.GetAction()
conclusion := event.GetWorkflowJob().GetConclusion()
status := event.GetWorkflowJob().GetStatus()

// Post-change: fetch the job once and read every label value from it.
workflowJob := event.GetWorkflowJob()
runnerGroup := workflowJob.GetRunnerGroupName()
conclusion := workflowJob.GetConclusion()
status := workflowJob.GetStatus()
workflowName := workflowJob.GetWorkflowName()
jobName := workflowJob.GetName()

switch action {
case "queued":
// Do nothing.
case "in_progress":

if len(event.WorkflowJob.Steps) == 0 {
if len(workflowJob.Steps) == 0 {
_ = level.Debug(c.Logger).Log("msg", "unable to calculate job duration of in_progress event as event has no steps")
break
}

// NOTE(review): firstStep.StartedAt is dereferenced without a nil check.
// If it is a pointer like workflowJob.StartedAt (which the "completed"
// branch nil-checks), a first step without a start timestamp would panic
// here — confirm against the step type and consider a nil-safe accessor.
firstStep := event.WorkflowJob.Steps[0]
queuedSeconds := firstStep.StartedAt.Time.Sub(event.WorkflowJob.StartedAt.Time).Seconds()
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "queued", runnerGroup, math.Max(0, queuedSeconds))
firstStep := workflowJob.Steps[0]
queuedSeconds := firstStep.StartedAt.Time.Sub(workflowJob.GetStartedAt().Time).Seconds()
// math.Max clamps a negative difference to zero before it is observed.
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "queued", runnerGroup, workflowName, jobName, math.Max(0, queuedSeconds))
case "completed":
if event.WorkflowJob.StartedAt == nil || event.WorkflowJob.CompletedAt == nil {
if workflowJob.StartedAt == nil || workflowJob.CompletedAt == nil {
_ = level.Debug(c.Logger).Log("msg", "unable to calculate job duration of completed event steps are missing timestamps")
break
}

// Run time of the job, clamped to be non-negative; it feeds both the
// histogram (state "in_progress") and the cumulative duration counter.
jobSeconds := math.Max(0, event.WorkflowJob.GetCompletedAt().Time.Sub(event.WorkflowJob.GetStartedAt().Time).Seconds())
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "in_progress", runnerGroup, jobSeconds)
c.PrometheusObserver.CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, jobSeconds)
jobSeconds := math.Max(0, workflowJob.GetCompletedAt().Time.Sub(workflowJob.GetStartedAt().Time).Seconds())
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "in_progress", runnerGroup, workflowName, jobName, jobSeconds)
c.PrometheusObserver.CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName, jobSeconds)
}

// Reached for every action, including the early-break paths above.
c.PrometheusObserver.CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup)
c.PrometheusObserver.CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName)
}

func (c *WorkflowMetricsExporter) CollectWorkflowRunEvent(event *github.WorkflowRunEvent) {
Expand Down
Loading

0 comments on commit 08d40ff

Please sign in to comment.