Skip to content

Commit

Permalink
Merge #119205
Browse files Browse the repository at this point in the history
119205: cli: --include-range-info flag for `cockroach debug zip` also toggles problem ranges r=abarganier a=nkodali


Previously, the --include-range-info flag only controlled whether `cockroach debug zip` included one file per node with information about the KV ranges on that node. With this change, the same flag also controls whether problem ranges are collected, because fetching problem ranges is suspected of degrading cluster performance on large clusters.

Fixes: #118991

Release note (ops change): The --include-range-info flag has been expanded to also control the inclusion of problem ranges. The flag still defaults to true.

Co-authored-by: Namrata Kodali <namrata@cockroachlabs.com>
  • Loading branch information
craig[bot] and nkodali committed Feb 14, 2024
2 parents e548249 + 4b24149 commit cc6ca02
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 12 deletions.
19 changes: 10 additions & 9 deletions pkg/cli/cliflags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -1643,15 +1643,16 @@ necessary to support CockroachDB.
ZipIncludeRangeInfo = FlagInfo{
Name: "include-range-info",
Description: `
Include one file per node with information about the KV ranges stored on that node,
in nodes/{node ID}/ranges.json. This information can be vital when debugging issues
that involve the KV storage layer, such as data placement, load balancing, performance
or other behaviors. In certain situations, on large clusters with large numbers of ranges,
these files can be omitted if and only if the issue being investigated is already known to
be in another layer of the system (for example, an error message about an unsupported
feature or incompatible value in a SQL schema change or statement). Note however many
higher-level issues are ultimately related to the underlying KV storage layer described
by these files so only set this to false if directed to do so by Cockroach Labs support.
Include one file per node with information about the KV ranges stored on that node,
in nodes/{node ID}/ranges.json. Additionally, include problem ranges information.
This information can be vital when debugging issues that involve the KV storage layer,
such as data placement, load balancing, performance or other behaviors. In certain situations,
on large clusters with large numbers of ranges, these files can be omitted if and only if the
issue being investigated is already known to be in another layer of the system (for example,
an error message about an unsupported feature or incompatible value in a SQL schema change or
statement). Note however many higher-level issues are ultimately related to the underlying KV
storage layer described by these files so only set this to false if directed to do so by Cockroach
Labs support.
`,
}

Expand Down
122 changes: 122 additions & 0 deletions pkg/cli/testdata/zip/testzip_exclude_range_info
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
zip
----
debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info=false /dev/null
[cluster] discovering virtual clusters... done
[cluster] creating output file /dev/null... done
[cluster] establishing RPC connection to ...
[cluster] using SQL address: ...
[cluster] requesting data for debug/events... received response... writing JSON output: debug/events.json... done
[cluster] requesting data for debug/rangelog... received response... writing JSON output: debug/rangelog.json... done
[cluster] requesting data for debug/settings... received response... writing JSON output: debug/settings.json... done
[cluster] retrieving SQL data for "".crdb_internal.create_function_statements... writing output: debug/crdb_internal.create_function_statements.txt... done
[cluster] retrieving SQL data for "".crdb_internal.create_procedure_statements... writing output: debug/crdb_internal.create_procedure_statements.txt... done
[cluster] retrieving SQL data for "".crdb_internal.create_schema_statements... writing output: debug/crdb_internal.create_schema_statements.txt... done
[cluster] retrieving SQL data for "".crdb_internal.create_statements... writing output: debug/crdb_internal.create_statements.txt... done
[cluster] retrieving SQL data for "".crdb_internal.create_type_statements... writing output: debug/crdb_internal.create_type_statements.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights... writing output: debug/crdb_internal.cluster_execution_insights.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_locks... writing output: debug/crdb_internal.cluster_locks.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_settings... writing output: debug/crdb_internal.cluster_settings.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_transactions... writing output: debug/crdb_internal.cluster_transactions.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_txn_execution_insights... writing output: debug/crdb_internal.cluster_txn_execution_insights.txt... done
[cluster] retrieving SQL data for crdb_internal.default_privileges... writing output: debug/crdb_internal.default_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.jobs... writing output: debug/crdb_internal.jobs.txt... done
[cluster] retrieving SQL data for crdb_internal.kv_node_liveness... writing output: debug/crdb_internal.kv_node_liveness.txt... done
[cluster] retrieving SQL data for crdb_internal.kv_node_status... writing output: debug/crdb_internal.kv_node_status.txt... done
[cluster] retrieving SQL data for crdb_internal.kv_protected_ts_records... writing output: debug/crdb_internal.kv_protected_ts_records.txt... done
[cluster] retrieving SQL data for crdb_internal.kv_store_status... writing output: debug/crdb_internal.kv_store_status.txt... done
[cluster] retrieving SQL data for crdb_internal.kv_system_privileges... writing output: debug/crdb_internal.kv_system_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.partitions... writing output: debug/crdb_internal.partitions.txt... done
[cluster] retrieving SQL data for crdb_internal.probe_ranges_1s_read_limit_100... writing output: debug/crdb_internal.probe_ranges_1s_read_limit_100.txt... done
[cluster] retrieving SQL data for crdb_internal.regions... writing output: debug/crdb_internal.regions.txt... done
[cluster] retrieving SQL data for crdb_internal.schema_changes... writing output: debug/crdb_internal.schema_changes.txt... done
[cluster] retrieving SQL data for crdb_internal.super_regions... writing output: debug/crdb_internal.super_regions.txt... done
[cluster] retrieving SQL data for crdb_internal.system_jobs... writing output: debug/crdb_internal.system_jobs.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.transaction_contention_events... writing output: debug/crdb_internal.transaction_contention_events.txt... done
[cluster] retrieving SQL data for crdb_internal.zones... writing output: debug/crdb_internal.zones.txt... done
[cluster] retrieving SQL data for system.database_role_settings... writing output: debug/system.database_role_settings.txt... done
[cluster] retrieving SQL data for system.descriptor... writing output: debug/system.descriptor.txt... done
[cluster] retrieving SQL data for system.eventlog... writing output: debug/system.eventlog.txt... done
[cluster] retrieving SQL data for system.external_connections... writing output: debug/system.external_connections.txt... done
[cluster] retrieving SQL data for system.job_info... writing output: debug/system.job_info.txt... done
[cluster] retrieving SQL data for system.jobs... writing output: debug/system.jobs.txt... done
[cluster] retrieving SQL data for system.lease... writing output: debug/system.lease.txt... done
[cluster] retrieving SQL data for system.locations... writing output: debug/system.locations.txt... done
[cluster] retrieving SQL data for system.migrations... writing output: debug/system.migrations.txt... done
[cluster] retrieving SQL data for system.namespace... writing output: debug/system.namespace.txt... done
[cluster] retrieving SQL data for system.privileges... writing output: debug/system.privileges.txt... done
[cluster] retrieving SQL data for system.protected_ts_meta... writing output: debug/system.protected_ts_meta.txt... done
[cluster] retrieving SQL data for system.protected_ts_records... writing output: debug/system.protected_ts_records.txt... done
[cluster] retrieving SQL data for system.rangelog... writing output: debug/system.rangelog.txt... done
[cluster] retrieving SQL data for system.replication_constraint_stats... writing output: debug/system.replication_constraint_stats.txt... done
[cluster] retrieving SQL data for system.replication_critical_localities... writing output: debug/system.replication_critical_localities.txt... done
[cluster] retrieving SQL data for system.replication_stats... writing output: debug/system.replication_stats.txt... done
[cluster] retrieving SQL data for system.reports_meta... writing output: debug/system.reports_meta.txt... done
[cluster] retrieving SQL data for system.role_id_seq... writing output: debug/system.role_id_seq.txt... done
[cluster] retrieving SQL data for system.role_members... writing output: debug/system.role_members.txt... done
[cluster] retrieving SQL data for system.role_options... writing output: debug/system.role_options.txt... done
[cluster] retrieving SQL data for system.scheduled_jobs... writing output: debug/system.scheduled_jobs.txt... done
[cluster] retrieving SQL data for system.settings... writing output: debug/system.settings.txt... done
[cluster] retrieving SQL data for system.span_configurations... writing output: debug/system.span_configurations.txt... done
[cluster] retrieving SQL data for system.sql_instances... writing output: debug/system.sql_instances.txt... done
[cluster] retrieving SQL data for system.sqlliveness... writing output: debug/system.sqlliveness.txt... done
[cluster] retrieving SQL data for system.statement_diagnostics... writing output: debug/system.statement_diagnostics.txt... done
[cluster] retrieving SQL data for system.statement_diagnostics_requests... writing output: debug/system.statement_diagnostics_requests.txt... done
[cluster] retrieving SQL data for system.statement_statistics_limit_5000... writing output: debug/system.statement_statistics_limit_5000.txt... done
[cluster] retrieving SQL data for system.table_statistics... writing output: debug/system.table_statistics.txt... done
[cluster] retrieving SQL data for system.task_payloads... writing output: debug/system.task_payloads.txt... done
[cluster] retrieving SQL data for system.tenant_settings... writing output: debug/system.tenant_settings.txt... done
[cluster] retrieving SQL data for system.tenant_tasks... writing output: debug/system.tenant_tasks.txt... done
[cluster] retrieving SQL data for system.tenant_usage... writing output: debug/system.tenant_usage.txt... done
[cluster] retrieving SQL data for system.tenants... writing output: debug/system.tenants.txt... done
[cluster] requesting nodes... received response... writing JSON output: debug/nodes.json... done
[cluster] requesting liveness... received response... writing JSON output: debug/liveness.json... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done
[node 1] node status... writing JSON output: debug/nodes/1/status.json... done
[node 1] using SQL connection URL: postgresql://...
[node 1] retrieving SQL data for crdb_internal.active_range_feeds... writing output: debug/nodes/1/crdb_internal.active_range_feeds.txt... done
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.kv_session_based_leases... writing output: debug/nodes/1/crdb_internal.kv_session_based_leases.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
[node 1] retrieving SQL data for crdb_internal.node_contention_events... writing output: debug/nodes/1/crdb_internal.node_contention_events.txt... done
[node 1] retrieving SQL data for crdb_internal.node_distsql_flows... writing output: debug/nodes/1/crdb_internal.node_distsql_flows.txt... done
[node 1] retrieving SQL data for crdb_internal.node_execution_insights... writing output: debug/nodes/1/crdb_internal.node_execution_insights.txt... done
[node 1] retrieving SQL data for crdb_internal.node_inflight_trace_spans... writing output: debug/nodes/1/crdb_internal.node_inflight_trace_spans.txt... done
[node 1] retrieving SQL data for crdb_internal.node_memory_monitors... writing output: debug/nodes/1/crdb_internal.node_memory_monitors.txt... done
[node 1] retrieving SQL data for crdb_internal.node_metrics... writing output: debug/nodes/1/crdb_internal.node_metrics.txt... done
[node 1] retrieving SQL data for crdb_internal.node_queries... writing output: debug/nodes/1/crdb_internal.node_queries.txt... done
[node 1] retrieving SQL data for crdb_internal.node_runtime_info... writing output: debug/nodes/1/crdb_internal.node_runtime_info.txt... done
[node 1] retrieving SQL data for crdb_internal.node_sessions... writing output: debug/nodes/1/crdb_internal.node_sessions.txt... done
[node 1] retrieving SQL data for crdb_internal.node_statement_statistics... writing output: debug/nodes/1/crdb_internal.node_statement_statistics.txt... done
[node 1] retrieving SQL data for crdb_internal.node_tenant_capabilities_cache... writing output: debug/nodes/1/crdb_internal.node_tenant_capabilities_cache.txt... done
[node 1] retrieving SQL data for crdb_internal.node_transaction_statistics... writing output: debug/nodes/1/crdb_internal.node_transaction_statistics.txt... done
[node 1] retrieving SQL data for crdb_internal.node_transactions... writing output: debug/nodes/1/crdb_internal.node_transactions.txt... done
[node 1] retrieving SQL data for crdb_internal.node_txn_execution_insights... writing output: debug/nodes/1/crdb_internal.node_txn_execution_insights.txt... done
[node 1] retrieving SQL data for crdb_internal.node_txn_stats... writing output: debug/nodes/1/crdb_internal.node_txn_stats.txt... done
[node 1] requesting data for debug/nodes/1/details... received response... writing JSON output: debug/nodes/1/details.json... done
[node 1] requesting data for debug/nodes/1/gossip... received response... writing JSON output: debug/nodes/1/gossip.json... done
[node 1] requesting data for debug/nodes/1/enginestats... received response... writing JSON output: debug/nodes/1/enginestats.json... done
[node 1] requesting stacks... received response... writing binary output: debug/nodes/1/stacks.txt... done
[node 1] requesting stacks with labels... received response... writing binary output: debug/nodes/1/stacks_with_labels.txt... done
[node 1] requesting heap profile... received response... writing binary output: debug/nodes/1/heap.pprof... done
[node 1] requesting heap file list... received response... done
[node ?] ? heap profiles found
[node 1] requesting goroutine dump list... received response... done
[node ?] ? goroutine dumps found
[node 1] requesting log files list... received response... done
[node ?] ? log files found
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
9 changes: 6 additions & 3 deletions pkg/cli/zip_cluster_wide.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ const (
func makeClusterWideZipRequests(
admin serverpb.AdminClient, status serverpb.StatusClient, prefix string,
) []zipRequest {
return []zipRequest{
zipRequests := []zipRequest{
// NB: we intentionally omit liveness since it's already pulled manually (we
// act on the output to special case decommissioned nodes).
{
Expand All @@ -62,13 +62,16 @@ func makeClusterWideZipRequests(
},
pathName: prefix + settingsName,
},
{
}
if zipCtx.includeRangeInfo {
zipRequests = append(zipRequests, zipRequest{
fn: func(ctx context.Context) (interface{}, error) {
return status.ProblemRanges(ctx, &serverpb.ProblemRangesRequest{})
},
pathName: prefix + problemRangesName,
},
})
}
return zipRequests
}

// collectClusterData runs the data collection that only needs to
Expand Down
34 changes: 34 additions & 0 deletions pkg/cli/zip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,40 @@ func TestZipIncludeRangeInfo(t *testing.T) {
)
}

// TestZipExcludeRangeInfo runs `debug zip` with --include-range-info=false
// against a test CLI instance backed by a temporary store directory, then
// compares the scrubbed command output with the testzip_exclude_range_info
// golden file.
func TestZipExcludeRangeInfo(t *testing.T) {
	defer leaktest.AfterTest(t)()

	skip.UnderRace(t, "test too slow under race")

	storeDir, removeStoreDir := testutils.TempDir(t)
	defer removeStoreDir()

	storeSpec := base.StoreSpec{Path: storeDir}
	tc := NewCLITest(TestCLIParams{StoreSpecs: []base.StoreSpec{storeSpec}})
	defer tc.Cleanup()

	// The archive itself is discarded; only the command's progress output is
	// checked below.
	zipCmd := "debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info=false " + os.DevNull
	rawOutput, err := tc.RunWithCapture(zipCmd)
	if err != nil {
		t.Fatal(err)
	}

	// Remove the messages that are not deterministic before comparing.
	scrubbed := eraseNonDeterministicZipOutput(rawOutput)

	// datadriven is used only as a golden-file reader here: the callback
	// ignores the test data and returns the captured output, which also makes
	// TESTFLAGS=-rewrite available for regenerating the expected output.
	goldenPath := datapathutils.TestDataPath(t, "zip", "testzip_exclude_range_info")
	datadriven.RunTest(t, goldenPath, func(*testing.T, *datadriven.TestData) string {
		return scrubbed
	})
}

// This tests the operation of zip running concurrently.
func TestConcurrentZip(t *testing.T) {
defer leaktest.AfterTest(t)()
Expand Down

0 comments on commit cc6ca02

Please sign in to comment.