Skip to content

Commit

Permalink
sql: fix sql activity cache for large cardinality
Browse files Browse the repository at this point in the history
Recreated from #112071

tldr: fix activity update job failing from conflict, caching to much
info when their is high cardinatlity in the stats, and fix cache to
store the correct top stats.

1. Fix activity update failure from upserting same row multiple times.
This was caused by 2 nodes having the same statement, but one only
executed it 2 or 3 time and the other node executed it enough to
generate an index recommendation. The different index recommendation
caused duplicate rows based on primary key to be inserted. It now uses
index recommendation from the merged stats which uses the latest existing
recommendation (we don't update if there is no recommendation).
2. Customers can have transactions that have over 1k unique statement
fingerprint ids. This makes it unreasonable to cache all of the
statement information for all the top transactions. This commit removes
the logic to cache all the statement information for the top
transactions which could cause the table to grow to over 100k for a
single hour. This requires the call to go the statistics table to get
all the necessary statement information for the txn details.
3. The activity cache logic was not correctly selecting the top CPU json
column. It was using the computed index column name instead of the json
field name. The values shown on the UI were correct because the endpoint
still aggregated several columns using the aggregated stats rather than
the separate column.
4. The activity cache logic was using the execution count instead of the
statistics count to select the top queries. In most cases this still
generates the correct top queries, and the UI uses the aggregated stats
which showed the correct value.

Fixes: #111780
Fixes: #111224

Release note (sql change): Fix the SQL activity update job to avoid
conflicts on update, reduces the amount of data cached to just what the
overview page requires, and fix the correctess of the top queries.
  • Loading branch information
maryliag committed Oct 21, 2023
1 parent e252306 commit 37ed721
Show file tree
Hide file tree
Showing 8 changed files with 1,203 additions and 1,070 deletions.
3 changes: 3 additions & 0 deletions pkg/server/application_api/sql_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,9 @@ func TestStatusAPICombinedStatements(t *testing.T) {
for _, respTxn := range resp.Transactions {
delete(expectedTxnFingerprints, respTxn.StatsData.TransactionFingerprintID)
}
// We set the default value of Transaction Fingerprint as 0 for some cases,
// so this also needs to be removed from the list.
delete(expectedTxnFingerprints, 0)

sort.Strings(expectedStmts)
sort.Strings(statementsInResponse)
Expand Down
86 changes: 22 additions & 64 deletions pkg/server/combined_statement_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,6 @@ func getCombinedStatementStats(
req,
transactions,
testingKnobs,
activityHasAllData,
tableSuffix)
} else {
statements, err = collectCombinedStatements(
Expand Down Expand Up @@ -672,11 +671,10 @@ func collectCombinedStatements(
tableSuffix string,
) ([]serverpb.StatementsResponse_CollectedStatementStatistics, error) {
aostClause := testingKnobs.GetAOSTClause()
const expectedNumDatums = 11
const expectedNumDatums = 10
const queryFormat = `
SELECT
fingerprint_id,
txn_fingerprints,
app_name,
aggregated_ts,
COALESCE(CAST(metadata -> 'distSQLCount' AS INT), 0) AS distSQLCount,
Expand All @@ -688,7 +686,6 @@ SELECT
FROM json_array_elements_text(metadata->'db') AS elem) AS databases,
statistics
FROM (SELECT fingerprint_id,
array_agg(distinct transaction_fingerprint_id) AS txn_fingerprints,
app_name,
max(aggregated_ts) AS aggregated_ts,
crdb_internal.merge_stats_metadata(array_agg(metadata)) AS metadata,
Expand All @@ -713,7 +710,6 @@ FROM (SELECT fingerprint_id,
`
SELECT
fingerprint_id,
txn_fingerprints,
app_name,
aggregated_ts,
COALESCE(CAST(metadata -> 'distSQLCount' AS INT), 0) AS distSQLCount,
Expand All @@ -725,7 +721,6 @@ SELECT
FROM json_array_elements_text(metadata->'db') AS elem) AS databases,
statistics
FROM (SELECT fingerprint_id,
array_agg(distinct transaction_fingerprint_id) AS txn_fingerprints,
app_name,
max(aggregated_ts) AS aggregated_ts,
crdb_internal.merge_aggregated_stmt_metadata(array_agg(metadata)) AS metadata,
Expand Down Expand Up @@ -802,25 +797,15 @@ FROM (SELECT fingerprint_id,
return nil, srverrors.ServerError(ctx, err)
}

var txnFingerprintID uint64
txnFingerprintDatums := tree.MustBeDArray(row[1])
txnFingerprintIDs := make([]appstatspb.TransactionFingerprintID, 0, txnFingerprintDatums.Array.Len())
for _, idDatum := range txnFingerprintDatums.Array {
if txnFingerprintID, err = sqlstatsutil.DatumToUint64(idDatum); err != nil {
return nil, srverrors.ServerError(ctx, err)
}
txnFingerprintIDs = append(txnFingerprintIDs, appstatspb.TransactionFingerprintID(txnFingerprintID))
}

app := string(tree.MustBeDString(row[2]))
app := string(tree.MustBeDString(row[1]))

aggregatedTs := tree.MustBeDTimestampTZ(row[3]).Time
distSQLCount := int64(*row[4].(*tree.DInt))
fullScanCount := int64(*row[5].(*tree.DInt))
failedCount := int64(*row[6].(*tree.DInt))
query := string(tree.MustBeDString(row[7]))
querySummary := string(tree.MustBeDString(row[8]))
databases := string(tree.MustBeDString(row[9]))
aggregatedTs := tree.MustBeDTimestampTZ(row[2]).Time
distSQLCount := int64(*row[3].(*tree.DInt))
fullScanCount := int64(*row[4].(*tree.DInt))
failedCount := int64(*row[5].(*tree.DInt))
query := string(tree.MustBeDString(row[6]))
querySummary := string(tree.MustBeDString(row[7]))
databases := string(tree.MustBeDString(row[8]))

metadata := appstatspb.CollectedStatementStatistics{
Key: appstatspb.StatementStatisticsKey{
Expand All @@ -835,7 +820,7 @@ FROM (SELECT fingerprint_id,
}

var stats appstatspb.StatementStatistics
statsJSON := tree.MustBeDJSON(row[10]).JSON
statsJSON := tree.MustBeDJSON(row[9]).JSON
if err = sqlstatsutil.DecodeStmtStatsStatisticsJSON(statsJSON, &stats); err != nil {
return nil, srverrors.ServerError(ctx, err)
}
Expand All @@ -847,7 +832,7 @@ FROM (SELECT fingerprint_id,
},
ID: appstatspb.StmtFingerprintID(statementFingerprintID),
Stats: stats,
TxnFingerprintIDs: txnFingerprintIDs,
TxnFingerprintIDs: []appstatspb.TransactionFingerprintID{appstatspb.InvalidTransactionFingerprintID},
}

statements = append(statements, stmt)
Expand Down Expand Up @@ -1026,13 +1011,15 @@ FROM (SELECT app_name,
return transactions, nil
}

// This does not use the activity tables because the statement information is
// aggregated to remove the transaction fingerprint id to keep the size of the
// statement_activity manageable when the transactions can have over 1k+ statement ids.
func collectStmtsForTxns(
ctx context.Context,
ie *sql.InternalExecutor,
req *serverpb.CombinedStatementsStatsRequest,
transactions []serverpb.StatementsResponse_ExtendedCollectedTransactionStatistics,
testingKnobs *sqlstats.TestingKnobs,
activityTableHasAllData bool,
tableSuffix string,
) ([]serverpb.StatementsResponse_CollectedStatementStatistics, error) {

Expand All @@ -1055,44 +1042,15 @@ GROUP BY
var it isql.Rows
var err error

if activityTableHasAllData {
it, err = ie.QueryIteratorEx(ctx, "console-combined-stmts-activity-for-txn", nil,
sessiondata.NodeUserSessionDataOverride, fmt.Sprintf(`
SELECT fingerprint_id,
transaction_fingerprint_id,
crdb_internal.merge_aggregated_stmt_metadata(array_agg(metadata)) AS metadata,
crdb_internal.merge_statement_stats(array_agg(statistics)) AS statistics,
app_name
FROM crdb_internal.statement_activity %s
GROUP BY
fingerprint_id,
transaction_fingerprint_id,
app_name`, whereClause),
args...)
if err != nil {
return nil, srverrors.ServerError(ctx, err)
}
}

// If there are no results from the activity table, retrieve the data from the persisted table.
var query string
if it == nil || !it.HasResults() {
if it != nil {
err = closeIterator(it, err)
if err != nil {
return nil, srverrors.ServerError(ctx, err)
}
}
query = fmt.Sprintf(
queryFormat,
crdbInternalStmtStatsPersisted+tableSuffix,
whereClause)
it, err = ie.QueryIteratorEx(ctx, "console-combined-stmts-persisted-for-txn", nil,
sessiondata.NodeUserSessionDataOverride, query, args...)
query := fmt.Sprintf(
queryFormat,
crdbInternalStmtStatsPersisted+tableSuffix,
whereClause)
it, err = ie.QueryIteratorEx(ctx, "console-combined-stmts-persisted-for-txn", nil,
sessiondata.NodeUserSessionDataOverride, query, args...)

if err != nil {
return nil, srverrors.ServerError(ctx, err)
}
if err != nil {
return nil, srverrors.ServerError(ctx, err)
}

// If there are no results from the persisted table, retrieve the data from the combined view
Expand Down
23 changes: 13 additions & 10 deletions pkg/sql/appstatspb/app_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,26 +188,29 @@ func (s *StatementStatistics) Add(other *StatementStatistics) {
s.Nodes = util.CombineUnique(s.Nodes, other.Nodes)
s.Regions = util.CombineUnique(s.Regions, other.Regions)
s.PlanGists = util.CombineUnique(s.PlanGists, other.PlanGists)
s.IndexRecommendations = other.IndexRecommendations
s.Indexes = util.CombineUnique(s.Indexes, other.Indexes)

s.ExecStats.Add(other.ExecStats)
s.LatencyInfo.Add(other.LatencyInfo)

if other.SensitiveInfo.LastErr != "" {
s.SensitiveInfo.LastErr = other.SensitiveInfo.LastErr
}

if other.LastErrorCode != "" {
s.LastErrorCode = other.LastErrorCode
}

if s.SensitiveInfo.MostRecentPlanTimestamp.Before(other.SensitiveInfo.MostRecentPlanTimestamp) {
s.SensitiveInfo = other.SensitiveInfo
}

// Use the LastExecTimestamp to decide which object has the last error.
if s.LastExecTimestamp.Before(other.LastExecTimestamp) {
s.LastExecTimestamp = other.LastExecTimestamp

if other.SensitiveInfo.LastErr != "" {
s.SensitiveInfo.LastErr = other.SensitiveInfo.LastErr
}

if other.LastErrorCode != "" {
s.LastErrorCode = other.LastErrorCode
}
}

if len(other.IndexRecommendations) > 0 {
s.IndexRecommendations = other.IndexRecommendations
}

s.Count += other.Count
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/crdb_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -7046,8 +7046,8 @@ CREATE VIEW crdb_internal.statement_activity AS
{Name: "execution_count", Typ: types.Int},
{Name: "execution_total_seconds", Typ: types.Float},
{Name: "execution_total_cluster_seconds", Typ: types.Float},
{Name: "cpu_sql_avg_nanos", Typ: types.Float},
{Name: "contention_time_avg_seconds", Typ: types.Float},
{Name: "cpu_sql_avg_nanos", Typ: types.Float},
{Name: "service_latency_avg_seconds", Typ: types.Float},
{Name: "service_latency_p99_seconds", Typ: types.Float},
},
Expand Down

0 comments on commit 37ed721

Please sign in to comment.