Skip to content

Commit

Permalink
Merge #124409
Browse files Browse the repository at this point in the history
124409: roachprod: add custom tags for prometheus config r=srosenberg a=nameisbhaskar

A few of the labels went missing in Prometheus after the change from "gce_sd_configs" to "file_sd_configs". This is because they were extracted from the VM metadata, which is no longer available. So, the same values are now set from roachprod.

Informs: #124319
Epic: none

Co-authored-by: Bhaskarjyoti Bora <bhaskar.bora@cockroachlabs.com>
  • Loading branch information
craig[bot] and nameisbhaskar committed May 21, 2024
2 parents a7aad51 + 9e30f6f commit d146ecf
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 9 deletions.
18 changes: 15 additions & 3 deletions pkg/roachprod/promhelperclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func (c *PromClient) UpdatePrometheusTargets(
ctx context.Context,
promUrl, clusterName string,
forceFetchCreds bool,
nodes map[int]string,
nodes map[int]*NodeInfo,
insecure bool,
l *logger.Logger,
) error {
Expand Down Expand Up @@ -157,17 +157,29 @@ type CCParams struct {
Labels map[string]string `yaml:"labels"`
}

// NodeInfo carries, for a single node, the Prometheus scrape target and any
// extra labels that should be attached to that target's cluster config entry.
type NodeInfo struct {
Target string // Scrape target for the node; used as the sole entry of the config's Targets list (roachprod passes "<ip>:<port>")
CustomLabels map[string]string // Labels merged into the config after the defaults, so an entry here overrides a default label with the same key
}

// buildCreateRequest builds the YAML request body holding one cluster config entry per node
func buildCreateRequest(nodes map[int]string, insecure bool) (io.Reader, error) {
func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, error) {
configs := make([]*CCParams, 0)
for i, n := range nodes {
params := &CCParams{
Targets: []string{n},
Targets: []string{n.Target},
Labels: map[string]string{
// default labels
"node": strconv.Itoa(i),
"tenant": install.SystemInterfaceName,
"job": "cockroachdb",
},
}
// custom labels - this can override the default labels if needed
for n, v := range n.CustomLabels {
params.Labels[n] = v
}
configs = append(configs, params)
}
cb, err := yaml.Marshal(&configs)
Expand Down
14 changes: 11 additions & 3 deletions pkg/roachprod/promhelperclient/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,16 @@ func TestUpdatePrometheusTargets(t *testing.T) {
Body: io.NopCloser(strings.NewReader("failed")),
}, nil
}
err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, map[int]string{1: "n1"}, true, l)
err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false,
map[int]*NodeInfo{1: {Target: "n1"}}, true, l)
require.NotNil(t, err)
require.Equal(t, "request failed with status 400 and error failed", err.Error())
})
t.Run("UpdatePrometheusTargets succeeds", func(t *testing.T) {
nodeInfos := map[int]string{1: "n1", 3: "n3"}
nodeInfos := map[int]*NodeInfo{1: {Target: "n1"}, 3: {
Target: "n3",
CustomLabels: map[string]string{"custom": "label"},
}}
c.httpPut = func(ctx context.Context, url string, h *http.Header, body io.Reader) (
resp *http.Response, err error) {
require.Equal(t, getUrl(promUrl, "c1"), url)
Expand All @@ -67,8 +71,12 @@ func TestUpdatePrometheusTargets(t *testing.T) {
for _, c := range configs {
nodeID, err := strconv.Atoi(c.Labels["node"])
require.NoError(t, err)
require.Equal(t, nodeInfos[nodeID], c.Targets[0])
require.Equal(t, nodeInfos[nodeID].Target, c.Targets[0])
require.Equal(t, "system", c.Labels["tenant"])
require.Equal(t, "cockroachdb", c.Labels["job"])
for k, v := range nodeInfos[nodeID].CustomLabels {
require.Equal(t, v, c.Labels[k])
}
}
return &http.Response{
StatusCode: 200,
Expand Down
32 changes: 29 additions & 3 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,7 @@ func UpdateTargets(

// updatePrometheusTargets updates the prometheus instance cluster config. Any error is logged and ignored.
func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.SyncedCluster) {
nodeIPPorts := make(map[int]string)
nodeIPPorts := make(map[int]*promhelperclient.NodeInfo)
nodeIPPortsMutex := syncutil.RWMutex{}
var wg sync.WaitGroup
for _, node := range c.Nodes {
Expand All @@ -808,10 +808,10 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S
l.Errorf("error getting the port for node %d: %v", index, err)
return
}
nodeInfo := fmt.Sprintf("%s:%d", v.PublicIP, desc.Port)
nodeInfo := fmt.Sprintf("%s:%d", v.PrivateIP, desc.Port)
nodeIPPortsMutex.Lock()
// ensure atomicity in map update
nodeIPPorts[index] = nodeInfo
nodeIPPorts[index] = &promhelperclient.NodeInfo{Target: nodeInfo, CustomLabels: getLabels(v)}
nodeIPPortsMutex.Unlock()
}(int(node), c.VMs[node-1])
}
Expand All @@ -826,6 +826,32 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S
}
}

// regionRegEx extracts the region from the zone string available as a VM
// property. It matches zones that end in "-" followed by a single letter in
// the range a-f, with at least one digit immediately before that suffix.
// Capture group 1 is the zone with the suffix removed (for example
// "us-east1-b" yields "us-east1"). Zones of any other shape do not match.
var regionRegEx = regexp.MustCompile("(^.+[0-9]+)(-[a-f]$)")

// getLabels assembles the custom labels attached to a VM's target
// configuration in prometheus.
//
// The result always contains "cluster", "instance", "host_ip", "project" and
// "zone", taken from the VM. "region" is added only when the VM's zone matches
// regionRegEx. "test_name" and "test_run_id" are copied only when those keys
// exist in the VM's own labels.
func getLabels(v vm.VM) map[string]string {
	customLabels := make(map[string]string)
	customLabels["cluster"] = v.Labels["cluster"]
	customLabels["instance"] = v.Name
	customLabels["host_ip"] = v.PrivateIP
	customLabels["project"] = v.Project
	customLabels["zone"] = v.Zone

	// Group 1 of the match is the zone without its trailing "-<letter>" part.
	if groups := regionRegEx.FindStringSubmatch(v.Zone); len(groups) > 1 {
		customLabels["region"] = groups[1]
	}

	// The test labels are present only if they were added to the VM before it
	// was started, so each one is copied over only when it exists.
	for _, key := range []string{"test_name", "test_run_id"} {
		if value, found := v.Labels[key]; found {
			customLabels[key] = value
		}
	}
	return customLabels
}

// Monitor monitors the status of cockroach nodes in a cluster.
func Monitor(
ctx context.Context, l *logger.Logger, clusterName string, opts install.MonitorOpts,
Expand Down

0 comments on commit d146ecf

Please sign in to comment.