Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-23.2: cli: enable collection of job traces in debug zip #113172

Merged
merged 1 commit into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pkg/cli/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,8 +386,9 @@ func setZipContextDefaults() {
// which impacts performance and SQL service latency.
zipCtx.includeStacks = true
// Collecting job traces for running Traceable jobs involves fetching cluster-wide traces
// for each job.
zipCtx.includeRunningJobTraces = false
// for each job. The number of such jobs is expected to be small, and so this
// flag is opt-out, not opt-in.
zipCtx.includeRunningJobTraces = true
zipCtx.cpuProfDuration = 5 * time.Second
zipCtx.concurrency = 15

Expand Down
2 changes: 1 addition & 1 deletion pkg/cli/testdata/zip/partial1
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[node 1] node status... writing JSON output: debug/nodes/1/status.json... done
[node 1] using SQL connection URL: postgresql://...
[node 1] retrieving SQL data for crdb_internal.active_range_feeds... writing output: debug/nodes/1/crdb_internal.active_range_feeds.txt... done
Expand Down Expand Up @@ -263,4 +264,3 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
2 changes: 1 addition & 1 deletion pkg/cli/testdata/zip/partial1_excluded
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[node 1] node status... writing JSON output: debug/nodes/1/status.json... done
[node 1] using SQL connection URL: postgresql://...
[node 1] retrieving SQL data for crdb_internal.active_range_feeds... writing output: debug/nodes/1/crdb_internal.active_range_feeds.txt... done
Expand Down Expand Up @@ -166,4 +167,3 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
2 changes: 1 addition & 1 deletion pkg/cli/testdata/zip/partial2
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[node 1] node status... writing JSON output: debug/nodes/1/status.json... done
[node 1] using SQL connection URL: postgresql://...
[node 1] retrieving SQL data for crdb_internal.active_range_feeds... writing output: debug/nodes/1/crdb_internal.active_range_feeds.txt... done
Expand Down Expand Up @@ -165,4 +166,3 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
2 changes: 1 addition & 1 deletion pkg/cli/testdata/zip/testzip
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -126,4 +127,3 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
4 changes: 3 additions & 1 deletion pkg/cli/testdata/zip/testzip_concurrent
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
zip
----
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
[cluster] collecting the inflight traces for jobs...
[cluster] collecting the inflight traces for jobs: done
[cluster] collecting the inflight traces for jobs: received response...
[cluster] creating output file /dev/null...
[cluster] creating output file /dev/null: done
[cluster] discovering virtual clusters...
Expand Down
2 changes: 1 addition & 1 deletion pkg/cli/testdata/zip/testzip_exclude_goroutine_stacks
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-goroutine-stacks=f
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -125,5 +126,4 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-goroutine-stacks=f
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
[cluster] NOTE: Omitted node-level goroutine stack dumps from this debug zip bundle. Use the --include-goroutine-stacks flag to enable the fetching of this data.
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -161,4 +162,3 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
2 changes: 1 addition & 1 deletion pkg/cli/testdata/zip/testzip_include_goroutine_stacks
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -126,4 +127,3 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
2 changes: 1 addition & 1 deletion pkg/cli/testdata/zip/testzip_include_range_info
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info /dev/nu
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -126,4 +127,3 @@ debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info /dev/nu
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
3 changes: 2 additions & 1 deletion pkg/cli/testdata/zip/testzip_shared_process_virtualization
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -227,6 +228,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/cluster/test-tenant/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/cluster/test-tenant/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -286,4 +288,3 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/cluster/test-tenant/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -227,6 +228,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] requesting tenant ranges... received response...
[cluster] requesting tenant ranges: last request failed: rpc error: ...
[cluster] requesting tenant ranges: creating error output: debug/cluster/test-tenant/tenant_ranges.err.txt... done
[cluster] collecting the inflight traces for jobs... received response... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1... writing binary output: debug/cluster/test-tenant/nodes/1/cpu.pprof... done
Expand Down Expand Up @@ -286,4 +288,3 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] pprof summary script... writing binary output: debug/cluster/test-tenant/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges.sh... done
[cluster] tenant hot range summary script... writing binary output: debug/cluster/test-tenant/hot-ranges-tenant.sh... done
[cluster] NOTE: Omitted traces of running jobs from this debug zip bundle. Use the --include-running-job-traces flag to enable the fetching of this data.
16 changes: 7 additions & 9 deletions pkg/cli/zip.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,8 +397,7 @@ type jobTrace struct {
// dumpTraceableJobTraces collects the traces for some "traceable" jobs that are
// in a running state. The job types in this list are the ones that have
// explicitly implemented the TraceableJob interface.
func (zc *debugZipContext) dumpTraceableJobTraces() error {
ctx := context.Background()
func (zc *debugZipContext) dumpTraceableJobTraces(ctx context.Context) error {
rows, err := zc.firstNodeSQLConn.Query(ctx,
`WITH
latestprogress AS (
Expand Down Expand Up @@ -426,13 +425,6 @@ INNER JOIN latestprogress ON j.id = latestprogress.job_id;`,
if err != nil {
return err
}
defer func() {
if rows != nil {
if err := rows.Close(); err != nil {
log.Warningf(ctx, "failed to close with error: %v", err)
}
}
}()
vals := make([]driver.Value, 2)
jobTraces := make([]jobTrace, 0)
for err = rows.Next(vals); err == nil; err = rows.Next(vals) {
Expand All @@ -450,6 +442,12 @@ INNER JOIN latestprogress ON j.id = latestprogress.job_id;`,
}
jobTraces = append(jobTraces, jobTrace{jobID: jobspb.JobID(jobID), traceID: progress.TraceID})
}
if err != io.EOF {
return err
}
if err := rows.Close(); err != nil {
return err
}

func() {
// Debug zip collection sets this to false since results from the query are
Expand Down
12 changes: 9 additions & 3 deletions pkg/cli/zip_cluster_wide.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,15 @@ func (zc *debugZipContext) collectClusterData(
}

if zipCtx.includeRunningJobTraces {
zc.clusterPrinter.info("collecting the inflight traces for jobs")
if err := zc.dumpTraceableJobTraces(); err != nil {
return &serverpb.NodesListResponse{}, nil, err
s := zc.clusterPrinter.start("collecting the inflight traces for jobs")
if requestErr := zc.runZipFn(ctx, s, func(ctx context.Context) error {
return zc.dumpTraceableJobTraces(ctx)
}); requestErr != nil {
if err := zc.z.createError(s, zc.prefix+"/jobs", requestErr); err != nil {
return &serverpb.NodesListResponse{}, nil, s.fail(err)
}
} else {
s.done()
}
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/cli/zip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,7 @@ func TestZipJobTrace(t *testing.T) {
timeout: 3 * time.Second,
firstNodeSQLConn: sqlConn,
}
if err := zc.dumpTraceableJobTraces(); err != nil {
if err := zc.dumpTraceableJobTraces(context.Background()); err != nil {
t.Fatal(err)
}
}()
Expand Down