Skip to content

Commit

Permalink
pkg/sql/sqlstats: move failures out of stmt key/meta, into stats
Browse files Browse the repository at this point in the history
In the SQL stats feature, we consider whether the statement failed as
part of the statement key/statement fingerprint ID, which is used to
bucket statement statistics aggregations.

This contributes to the cardinality of statement statistics by a
constant factor of 2x. A statement that has failed will show up twice in
the SQL Activity page, once for the successful executions, and once for
the failed executions. Despite not being very useful, the contributions
it makes to cardinality are detrimental to the SQL activity feature, as
it has a non-negligible impact on data volume.

To improve upon this, we should remove the failure bool from the
statement key, and instead include failure counts as part of the
statistics object.

This patch does the following:
* Removes the failure boolean from the statement key
* Removes failure counts from the statement statistics metadata
* Moves the failure counts into the aggrgeated statement statistics
* Updates the Statement Details page to display "Failure Count" instead
  of "Failed?", with the associated numeric count instead of a bool val.

Release note (ui change): In the DB Console's SQL Activity page, the
Statement Details page will now show "Failure Count" in place of
"Failed?", showing the number of failed executions for the given
statement fingerprint instead of a "Yes"/"No" value. Furthermore,
previously in the SQL Activity table the same statement fingeprint
would appear twice - once for failed executions, and once for
successful executions. With these changes, we no longer consider
these to be separate rows. Instead, we combine them into a single
statement fingerprint.

Release note (sql change): A new column has been added to
`crdb_internal.node_statement_statistics`, `failure_count INT NOT NULL`.
It represents the number of recorded statement execution failures for
the given statement, as a new component of the overall statistics.
  • Loading branch information
abarganier committed Mar 11, 2024
1 parent c43d4f9 commit 3756c81
Show file tree
Hide file tree
Showing 27 changed files with 62 additions and 110 deletions.
19 changes: 5 additions & 14 deletions pkg/server/application_api/sql_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,7 @@ func TestStatusAPIStatements(t *testing.T) {
// See if the statements returned are what we executed.
var statementsInResponse []string
for _, respStatement := range resp.Statements {
if respStatement.Key.KeyData.Failed {
if respStatement.Stats.FailureCount > 0 {
// We ignore failed statements here as the INSERT statement can fail and
// be automatically retried, confusing the test success check.
continue
Expand Down Expand Up @@ -753,7 +753,6 @@ func TestStatusAPICombinedStatementsWithFullScans(t *testing.T) {
count int
fullScan bool
distSQL bool
failed bool
}

// expectedStatementStatsMap maps the query response format to the associated
Expand All @@ -769,7 +768,6 @@ func TestStatusAPICombinedStatementsWithFullScans(t *testing.T) {
expectedStatementStatsMap[stmt.respQuery] = ExpectedStatementData{
fullScan: stmt.fullScan,
distSQL: stmt.distSQL,
failed: stmt.failed,
count: stmt.count,
}
}
Expand Down Expand Up @@ -798,7 +796,7 @@ func TestStatusAPICombinedStatementsWithFullScans(t *testing.T) {
// statements that were that were successfully executed by the test app
// to avoid counting such failures. If a statement that we expect to be
// successful is not found in the response, the test will fail later.
if respStatement.Key.KeyData.App == testAppName && !respStatement.Key.KeyData.Failed {
if respStatement.Key.KeyData.App == testAppName && respStatement.Stats.FailureCount == 0 {
actualResponseStatsMap[respStatement.Key.KeyData.Query] = respStatement
}
}
Expand All @@ -810,13 +808,11 @@ func TestStatusAPICombinedStatementsWithFullScans(t *testing.T) {
actualCount := respStatement.Stats.FirstAttemptCount
actualFullScan := respStatement.Key.KeyData.FullScan
actualDistSQL := respStatement.Key.KeyData.DistSQL
actualFailed := respStatement.Key.KeyData.Failed

stmtJSONString := responseToJSON(respStatement)

require.Equal(t, expectedData.fullScan, actualFullScan, "failed for respStatement: %v", stmtJSONString)
require.Equal(t, expectedData.distSQL, actualDistSQL, "failed for respStatement: %v", stmtJSONString)
require.Equal(t, expectedData.failed, actualFailed, "failed for respStatement: %v", stmtJSONString)
require.Equal(t, expectedData.count, int(actualCount), "failed for respStatement: %v", stmtJSONString)
}
}
Expand Down Expand Up @@ -912,7 +908,7 @@ func TestStatusAPICombinedStatements(t *testing.T) {
var statementsInResponse []string
expectedTxnFingerprints := map[appstatspb.TransactionFingerprintID]struct{}{}
for _, respStatement := range resp.Statements {
if respStatement.Key.KeyData.Failed {
if respStatement.Stats.FailureCount > 0 {
// We ignore failed statements here as the INSERT statement can fail and
// be automatically retried, confusing the test success check.
continue
Expand Down Expand Up @@ -1066,8 +1062,7 @@ func TestStatusAPIStatementDetails(t *testing.T) {
}

query := `INSERT INTO posts VALUES (_, '_')`
fingerprintID := appstatspb.ConstructStatementFingerprintID(query,
false, true, `roachblog`)
fingerprintID := appstatspb.ConstructStatementFingerprintID(query, true, `roachblog`)
path := fmt.Sprintf(`stmtdetails/%v`, fingerprintID)

var resp serverpb.StatementDetailsResponse
Expand Down Expand Up @@ -1260,8 +1255,7 @@ func TestStatusAPIStatementDetails(t *testing.T) {
}

selectQuery := "SELECT _, _, _, _"
fingerprintID = appstatspb.ConstructStatementFingerprintID(selectQuery, false,
true, "defaultdb")
fingerprintID = appstatspb.ConstructStatementFingerprintID(selectQuery, true, "defaultdb")

testPath(
fmt.Sprintf(`stmtdetails/%v`, fingerprintID),
Expand Down Expand Up @@ -1583,7 +1577,6 @@ func generateStatement() appstatspb.CollectedStatementStatistics {
Database: "test_database",
DistSQL: true,
FullScan: true,
Failed: false,
ImplicitTxn: true,
PlanHash: uint64(200),
Query: "SELECT * FROM foo",
Expand Down Expand Up @@ -1729,7 +1722,6 @@ func insertStatementIntoSystemStmtStatsTable(
metadata := struct {
Database string `json:"db"`
DistSQL bool `json:"distsql"`
Failed bool `json:"failed"`
FullScan bool `json:"fullScan"`
ImplicitTxn bool `json:"implicitTxn"`
Query string `json:"query"`
Expand All @@ -1739,7 +1731,6 @@ func insertStatementIntoSystemStmtStatsTable(
}{
Database: statement.Key.Database,
DistSQL: statement.Key.DistSQL,
Failed: statement.Key.Failed,
FullScan: statement.Key.FullScan,
ImplicitTxn: statement.Key.ImplicitTxn,
Query: statement.Key.Query,
Expand Down
14 changes: 5 additions & 9 deletions pkg/server/combined_statement_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -666,15 +666,14 @@ func collectCombinedStatements(
activityTableHasAllData bool,
) ([]serverpb.StatementsResponse_CollectedStatementStatistics, error) {
aostClause := testingKnobs.GetAOSTClause()
const expectedNumDatums = 10
const expectedNumDatums = 9
const queryFormat = `
SELECT
fingerprint_id,
app_name,
aggregated_ts,
COALESCE(CAST(metadata -> 'distSQLCount' AS INT), 0) AS distSQLCount,
COALESCE(CAST(metadata -> 'fullScanCount' AS INT), 0) AS fullScanCount,
COALESCE(CAST(metadata -> 'failedCount' AS INT), 0) AS failedCount,
metadata ->> 'query' AS query,
metadata ->> 'querySummary' AS querySummary,
(SELECT string_agg(elem::text, ',')
Expand Down Expand Up @@ -709,7 +708,6 @@ SELECT
aggregated_ts,
COALESCE(CAST(metadata -> 'distSQLCount' AS INT), 0) AS distSQLCount,
COALESCE(CAST(metadata -> 'fullScanCount' AS INT), 0) AS fullScanCount,
COALESCE(CAST(metadata -> 'failedCount' AS INT), 0) AS failedCount,
metadata ->> 'query' AS query,
metadata ->> 'querySummary' AS querySummary,
(SELECT string_agg(elem::text, ',')
Expand Down Expand Up @@ -797,25 +795,23 @@ FROM (SELECT fingerprint_id,
aggregatedTs := tree.MustBeDTimestampTZ(row[2]).Time
distSQLCount := int64(*row[3].(*tree.DInt))
fullScanCount := int64(*row[4].(*tree.DInt))
failedCount := int64(*row[5].(*tree.DInt))
query := string(tree.MustBeDString(row[6]))
querySummary := string(tree.MustBeDString(row[7]))
databases := string(tree.MustBeDString(row[8]))
query := string(tree.MustBeDString(row[5]))
querySummary := string(tree.MustBeDString(row[6]))
databases := string(tree.MustBeDString(row[7]))

metadata := appstatspb.CollectedStatementStatistics{
Key: appstatspb.StatementStatisticsKey{
App: app,
DistSQL: distSQLCount > 0,
FullScan: fullScanCount > 0,
Failed: failedCount > 0,
Query: query,
QuerySummary: querySummary,
Database: databases,
},
}

var stats appstatspb.StatementStatistics
statsJSON := tree.MustBeDJSON(row[9]).JSON
statsJSON := tree.MustBeDJSON(row[8]).JSON
if err = sqlstatsutil.DecodeStmtStatsStatisticsJSON(statsJSON, &stats); err != nil {
return nil, srverrors.ServerError(ctx, err)
}
Expand Down
5 changes: 0 additions & 5 deletions pkg/server/statement_details.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,11 @@ func getStatementDetails(
// since the metadata is unique per plan hash.
// Update the statementTotal.Metadata.*Count with the counts from statementStatisticsPerPlanHash.keyData.
statementTotal.Metadata.DistSQLCount = 0
statementTotal.Metadata.FailedCount = 0
statementTotal.Metadata.FullScanCount = 0
statementTotal.Metadata.VecCount = 0
statementTotal.Metadata.TotalCount = 0
for _, planStats := range statementStatisticsPerPlanHash {
statementTotal.Metadata.DistSQLCount += planStats.Metadata.DistSQLCount
statementTotal.Metadata.FailedCount += planStats.Metadata.FailedCount
statementTotal.Metadata.FullScanCount += planStats.Metadata.FullScanCount
statementTotal.Metadata.VecCount += planStats.Metadata.VecCount
statementTotal.Metadata.TotalCount += planStats.Metadata.TotalCount
Expand Down Expand Up @@ -620,9 +618,6 @@ LIMIT $%d`, whereClause, len(args)), args...)
if aggregatedMetadata.DistSQLCount > 0 {
aggregatedMetadata.DistSQLCount = metadata.Stats.Count
}
if aggregatedMetadata.FailedCount > 0 {
aggregatedMetadata.FailedCount = metadata.Stats.Count
}
if aggregatedMetadata.FullScanCount > 0 {
aggregatedMetadata.FullScanCount = metadata.Stats.Count
}
Expand Down
13 changes: 3 additions & 10 deletions pkg/sql/appstatspb/app_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ import (
type StmtFingerprintID uint64

// ConstructStatementFingerprintID constructs an ID by hashing query with
// constants redacted, its database and failure status, and if it was part of an
// constants redacted, its database, and if it was part of an
// implicit txn. At the time of writing, these are the axis' we use to bucket
// queries for stats collection (see stmtKey).
func ConstructStatementFingerprintID(
stmtNoConstants string, failed bool, implicitTxn bool, database string,
stmtNoConstants string, implicitTxn bool, database string,
) StmtFingerprintID {
fnv := util.MakeFNV64()
for _, c := range stmtNoConstants {
Expand All @@ -33,11 +33,6 @@ func ConstructStatementFingerprintID(
for _, c := range database {
fnv.Add(uint64(c))
}
if failed {
fnv.Add('F')
} else {
fnv.Add('S')
}
if implicitTxn {
fnv.Add('I')
} else {
Expand Down Expand Up @@ -156,9 +151,6 @@ func (s *AggregatedStatementMetadata) Add(other *CollectedStatementStatistics) {
if other.Key.DistSQL {
s.DistSQLCount++
}
if other.Key.Failed {
s.FailedCount++
}
if other.Key.FullScan {
s.FullScanCount++
}
Expand Down Expand Up @@ -214,6 +206,7 @@ func (s *StatementStatistics) Add(other *StatementStatistics) {
}

s.Count += other.Count
s.FailureCount += other.FailureCount
}

// AlmostEqual compares two StatementStatistics and their contained NumericStats
Expand Down
9 changes: 6 additions & 3 deletions pkg/sql/appstatspb/app_stats.proto
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ message StatementStatistics {
// last_error_code is the last error code for a failed statement, if it exists.
optional string last_error_code = 32 [(gogoproto.nullable) = false];

// failure_count is the count of failed executions for a given statement fingerprint.
optional int64 failure_count = 33 [(gogoproto.nullable) = false];

// Note: be sure to update `sql/app_stats.go` when adding/removing fields here!

reserved 13, 14, 17, 18, 19, 20;
Expand Down Expand Up @@ -209,15 +212,14 @@ message StatementStatisticsKey {
optional string query = 1 [(gogoproto.nullable) = false];
optional string app = 2 [(gogoproto.nullable) = false];
optional bool distSQL = 3 [(gogoproto.nullable) = false];
optional bool failed = 4 [(gogoproto.nullable) = false];
optional bool implicit_txn = 6 [(gogoproto.nullable) = false];
optional bool vec = 7 [(gogoproto.nullable) = false];
optional bool full_scan = 8 [(gogoproto.nullable) = false];
optional string database = 9 [(gogoproto.nullable) = false];
optional uint64 plan_hash = 10 [(gogoproto.nullable) = false];
optional string query_summary = 12 [(gogoproto.nullable) = false];

reserved 5;
reserved 4, 5;
optional uint64 transaction_fingerprint_id = 11
[(gogoproto.nullable) = false,
(gogoproto.customname) = "TransactionFingerprintID",
Expand All @@ -235,11 +237,12 @@ message AggregatedStatementMetadata {
repeated string databases = 6;
optional bool implicit_txn = 7 [(gogoproto.nullable) = false];
optional int64 dist_sql_count = 8 [(gogoproto.nullable) = false, (gogoproto.customname) = "DistSQLCount"];
optional int64 failed_count = 9 [(gogoproto.nullable) = false];
optional int64 full_scan_count = 10 [(gogoproto.nullable) = false];
optional int64 vec_count = 11 [(gogoproto.nullable) = false];
optional int64 total_count = 12 [(gogoproto.nullable) = false];
optional string fingerprint_id = 13 [(gogoproto.nullable) = false, (gogoproto.customname) = "FingerprintID"];

reserved 9;
}

// CollectedStatementStatistics wraps collected timings and metadata for some
Expand Down
2 changes: 0 additions & 2 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -2808,7 +2808,6 @@ func (ex *connExecutor) execCopyOut(
f.FormatNode(cmd.Stmt)
stmtFingerprintID := appstatspb.ConstructStatementFingerprintID(
f.CloseAndGetString(),
copyErr != nil,
ex.implicitTxn(),
ex.planner.CurrentDatabase(),
)
Expand Down Expand Up @@ -3073,7 +3072,6 @@ func (ex *connExecutor) execCopyIn(
f.FormatNode(cmd.Stmt)
stmtFingerprintID := appstatspb.ConstructStatementFingerprintID(
f.CloseAndGetString(),
copyErr != nil,
ex.implicitTxn(),
ex.planner.CurrentDatabase(),
)
Expand Down
2 changes: 0 additions & 2 deletions pkg/sql/conn_executor_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -819,7 +819,6 @@ func (ex *connExecutor) execStmtInOpenState(
f.FormatNode(ast)
stmtFingerprintID := appstatspb.ConstructStatementFingerprintID(
f.CloseAndGetString(),
execErr != nil,
ex.implicitTxn(),
p.CurrentDatabase(),
)
Expand Down Expand Up @@ -2533,7 +2532,6 @@ func (ex *connExecutor) execStmtInNoTxnState(
f.FormatNode(stmt.AST)
stmtFingerprintID := appstatspb.ConstructStatementFingerprintID(
f.CloseAndGetString(),
execErr != nil,
ex.implicitTxn(),
p.CurrentDatabase(),
)
Expand Down
7 changes: 3 additions & 4 deletions pkg/sql/crdb_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -1804,7 +1804,8 @@ CREATE TABLE crdb_internal.node_statement_statistics (
latency_seconds_max FLOAT,
latency_seconds_p50 FLOAT,
latency_seconds_p90 FLOAT,
latency_seconds_p99 FLOAT
latency_seconds_p99 FLOAT,
failure_count INT NOT NULL
)`,
populate: func(ctx context.Context, p *planner, _ catalog.DatabaseDescriptor, addRow func(...tree.Datum) error) error {
shouldRedactError := false
Expand Down Expand Up @@ -1850,9 +1851,6 @@ CREATE TABLE crdb_internal.node_statement_statistics (
if stats.Key.DistSQL {
flags = "+"
}
if stats.Key.Failed {
flags = "!" + flags
}

samplePlan := sqlstatsutil.ExplainTreePlanNodeToJSON(&stats.Stats.SensitiveInfo.MostRecentPlanDescription)

Expand Down Expand Up @@ -1958,6 +1956,7 @@ CREATE TABLE crdb_internal.node_statement_statistics (
alloc.NewDFloat(tree.DFloat(stats.Stats.LatencyInfo.P50)), // latency_seconds_p50
alloc.NewDFloat(tree.DFloat(stats.Stats.LatencyInfo.P90)), // latency_seconds_p90
alloc.NewDFloat(tree.DFloat(stats.Stats.LatencyInfo.P99)), // latency_seconds_p99
alloc.NewDInt(tree.DInt(stats.Stats.FailureCount)), // failure_count
)
if err != nil {
return err
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/executor_statement_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,6 @@ func (ex *connExecutor) recordStatementSummary(
Vec: flags.IsSet(planFlagVectorized),
ImplicitTxn: flags.IsSet(planFlagImplicitTxn),
FullScan: fullScan,
Failed: stmtErr != nil,
Database: planner.SessionData().Database,
PlanHash: planner.instrumentation.planGist.Hash(),
}
Expand All @@ -191,6 +190,7 @@ func (ex *connExecutor) recordStatementSummary(
SessionID: ex.planner.extendedEvalCtx.SessionID,
StatementID: stmt.QueryID,
AutoRetryCount: automaticRetryCount,
Failed: stmtErr != nil,
AutoRetryReason: ex.state.mu.autoRetryReason,
RowsAffected: rowsAffected,
IdleLatencySec: idleLatSec,
Expand Down
4 changes: 1 addition & 3 deletions pkg/sql/instrumentation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ func TestSampledStatsCollection(t *testing.T) {
Query: stmt,
ImplicitTxn: implicitTxn,
Database: database,
Failed: false,
}
var stats *appstatspb.CollectedStatementStatistics
require.NoError(t, sqlStats.
Expand All @@ -63,8 +62,7 @@ func TestSampledStatsCollection(t *testing.T) {
func(ctx context.Context, statistics *appstatspb.CollectedStatementStatistics) error {
if statistics.Key.Query == key.Query &&
statistics.Key.ImplicitTxn == key.ImplicitTxn &&
statistics.Key.Database == key.Database &&
statistics.Key.Failed == key.Failed {
statistics.Key.Database == key.Database {
stats = statistics
}

Expand Down
1 change: 0 additions & 1 deletion pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -4560,7 +4560,6 @@ value if you rely on the HLC for accuracy.`,
metadata.Databases = util.CombineUnique(metadata.Databases, other.Databases)

metadata.DistSQLCount += other.DistSQLCount
metadata.FailedCount += other.FailedCount
metadata.FullScanCount += other.FullScanCount
metadata.VecCount += other.VecCount
metadata.TotalCount += other.TotalCount
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ func InsertMockedIntoSystemStmtActivity(
Databases: []string{stmtStats.Key.Database},
ImplicitTxn: false,
DistSQLCount: 0,
FailedCount: 0,
FullScanCount: 0,
VecCount: 0,
TotalCount: 0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ func TestSQLStatsJsonEncoding(t *testing.T) {
"querySummary": "{{.String}}",
"db": "{{.String}}",
"distsql": {{.Bool}},
"failed": {{.Bool}},
"implicitTxn": {{.Bool}},
"vec": {{.Bool}},
"fullScan": {{.Bool}}
Expand All @@ -58,6 +57,7 @@ func TestSQLStatsJsonEncoding(t *testing.T) {
"statistics": {
"cnt": {{.Int64}},
"firstAttemptCnt": {{.Int64}},
"failureCount": {{.Int64}},
"maxRetries": {{.Int64}},
"lastExecAt": "{{stringifyTime .Time}}",
"numRows": {
Expand Down Expand Up @@ -586,7 +586,6 @@ func TestSQLStatsJsonEncoding(t *testing.T) {
"querySummary": "{{.String}}",
"implicitTxn": {{.Bool}},
"distSQLCount": {{.Int64}},
"failedCount": {{.Int64}},
"vecCount": {{.Int64}},
"fullScanCount": {{.Int64}},
"totalCount": {{.Int64}},
Expand Down

0 comments on commit 3756c81

Please sign in to comment.