sql: collect table statistics on virtual computed columns
Add rendering of virtual computed column expressions to the CREATE
STATISTICS DistSQL plan. The render expressions are always added as
post-processors on the TableReader nodes that feed the Samplers.

With this change we now collect table statistics on virtual computed
columns. A future PR will make use of these statistics in the
statistics builder.

Collection of partial statistics (USING EXTREMES) on virtual computed
columns works after this change with no extra effort, because partial
statistics collection uses secondary indexes, in which virtual computed
columns are stored like regular columns.
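
For illustration only, here is a minimal client-side sketch (not part of
this commit) of that partial-statistics path, assuming a cluster running
this change and an already-open database/sql handle; the `events` table,
its index, and the statistic names are hypothetical.

```go
package example

import (
	"context"
	"database/sql"
)

// refreshAtExtremes sketches a partial-statistics refresh on a virtual
// computed column. USING EXTREMES scans a secondary index, and an index on
// the virtual column stores the computed values like an ordinary column, so
// this path needs no extra rendering.
func refreshAtExtremes(ctx context.Context, db *sql.DB) error {
	stmts := []string{
		// Hypothetical schema: a virtual computed column over a JSONB payload,
		// with a secondary index on it for the partial scan to read.
		`CREATE TABLE IF NOT EXISTS events (
			id INT PRIMARY KEY,
			payload JSONB,
			kind STRING AS (payload->>'kind') VIRTUAL,
			INDEX (kind)
		)`,
		// A full statistic on the column must exist before refreshing at the
		// extremes.
		`CREATE STATISTICS events_full FROM events`,
		`CREATE STATISTICS events_partial ON kind FROM events USING EXTREMES`,
	}
	for _, stmt := range stmts {
		if _, err := db.ExecContext(ctx, stmt); err != nil {
			return err
		}
	}
	return nil
}
```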

Informs: #68254

Epic: CRDB-8949

Release note (sql change): Added a new cluster setting,
`sql.stats.virtual_computed_columns.enabled`, which when set to true
(the default) enables collection of table statistics on virtual
computed columns.
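
As a hedged usage sketch of the setting described above (not taken from
the commit), assuming a local single-node CockroachDB built with this
change and the `lib/pq` driver; the connection string, table `t`, and
statistic name are illustrative.

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // CockroachDB speaks the PostgreSQL wire protocol.
)

func main() {
	// Illustrative connection string for a local insecure node.
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	stmts := []string{
		// Defaults to true in this commit; set explicitly here for clarity.
		`SET CLUSTER SETTING sql.stats.virtual_computed_columns.enabled = true`,
		`CREATE TABLE IF NOT EXISTS t (a INT PRIMARY KEY, b INT AS (a % 10) VIRTUAL)`,
		`CREATE STATISTICS t_stats FROM t`,
	}
	for _, stmt := range stmts {
		if _, err := db.Exec(stmt); err != nil {
			log.Fatal(err)
		}
	}

	// The virtual column b should now appear among the collected statistics.
	rows, err := db.Query(
		`SELECT statistics_name, array_to_string(column_names, ',')
		   FROM [SHOW STATISTICS FOR TABLE t]`)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()
	for rows.Next() {
		var name, cols string
		if err := rows.Scan(&name, &cols); err != nil {
			log.Fatal(err)
		}
		fmt.Println(name, cols)
	}
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}
}
```

With the setting disabled, the virtual column `b` is skipped by
default-column selection, and an explicit `CREATE STATISTICS ... ON b` is
rejected with an `InvalidColumnReference` error, per the checks in
`pkg/sql/create_stats.go` below.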
michae2 committed Jan 30, 2024
1 parent 70150a6 commit c87fbda
Showing 5 changed files with 540 additions and 38 deletions.
72 changes: 55 additions & 17 deletions pkg/sql/create_stats.go
@@ -61,14 +61,25 @@ var featureStatsEnabled = settings.RegisterBoolSetting(
featureflag.FeatureFlagEnabledDefault,
settings.WithPublic)

var statsOnVirtualCols = settings.RegisterBoolSetting(
settings.ApplicationLevel,
"sql.stats.virtual_computed_columns.enabled",
"set to true to collect table statistics on virtual computed columns",
true,
settings.WithPublic)

const nonIndexColHistogramBuckets = 2

// StubTableStats generates "stub" statistics for a table which are missing
// histograms and have 0 for all values.
// statistics on virtual computed columns, multi-column stats, and histograms,
// and have 0 for all values.
func StubTableStats(
desc catalog.TableDescriptor, name string, multiColEnabled bool, defaultHistogramBuckets uint32,
desc catalog.TableDescriptor, name string,
) ([]*stats.TableStatisticProto, error) {
colStats, err := createStatsDefaultColumns(desc, multiColEnabled, defaultHistogramBuckets)
colStats, err := createStatsDefaultColumns(
context.Background(), desc, false /* virtColEnabled */, false, /* multiColEnabled */
nonIndexColHistogramBuckets, nil, /* evalCtx */
)
if err != nil {
return nil, err
}
@@ -235,17 +246,18 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, error) {
var colStats []jobspb.CreateStatsDetails_ColStat
var deleteOtherStats bool
if len(n.ColumnNames) == 0 {
// Disable multi-column stats and deleting stats
// if partial statistics at the extremes are requested.
// TODO (faizaanmadhani): Add support for multi-column stats.
virtColEnabled := statsOnVirtualCols.Get(n.p.ExecCfg().SV())
// Disable multi-column stats and deleting stats if partial statistics at
// the extremes are requested.
// TODO(faizaanmadhani): Add support for multi-column stats.
var multiColEnabled bool
if !n.Options.UsingExtremes {
multiColEnabled = stats.MultiColumnStatisticsClusterMode.Get(&n.p.ExecCfg().Settings.SV)
multiColEnabled = stats.MultiColumnStatisticsClusterMode.Get(n.p.ExecCfg().SV())
deleteOtherStats = true
}
defaultHistogramBuckets := stats.GetDefaultHistogramBuckets(n.p.ExecCfg().SV(), tableDesc)
if colStats, err = createStatsDefaultColumns(
tableDesc, multiColEnabled, defaultHistogramBuckets,
ctx, tableDesc, virtColEnabled, multiColEnabled, defaultHistogramBuckets, n.p.EvalContext(),
); err != nil {
return nil, err
}
@@ -257,7 +269,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, error) {

columnIDs := make([]descpb.ColumnID, len(columns))
for i := range columns {
if columns[i].IsVirtual() {
if columns[i].IsVirtual() && !statsOnVirtualCols.Get(n.p.ExecCfg().SV()) {
return nil, pgerror.Newf(
pgcode.InvalidColumnReference,
"cannot create statistics on virtual column %q",
@@ -357,12 +369,41 @@ const maxNonIndexCols = 100
// other columns from the table. We only collect histograms for index columns,
// plus any other boolean or enum columns (where the "histogram" is tiny).
func createStatsDefaultColumns(
desc catalog.TableDescriptor, multiColEnabled bool, defaultHistogramBuckets uint32,
ctx context.Context,
desc catalog.TableDescriptor,
virtColEnabled, multiColEnabled bool,
defaultHistogramBuckets uint32,
evalCtx *eval.Context,
) ([]jobspb.CreateStatsDetails_ColStat, error) {
colStats := make([]jobspb.CreateStatsDetails_ColStat, 0, len(desc.ActiveIndexes()))

requestedStats := make(map[string]struct{})

// CREATE STATISTICS only runs as a fully-distributed plan. If statistics on
// virtual computed columns are enabled, we must check whether each virtual
// computed column expression is safe to distribute. Virtual computed columns
// with expressions *not* safe to distribute will be skipped, even if
// sql.stats.virtual_computed_columns.enabled is true.
cannotDistribute := make([]bool, len(desc.PublicColumns()))
if virtColEnabled {
semaCtx := tree.MakeSemaContext()
exprs, _, err := schemaexpr.MakeComputedExprs(
ctx,
desc.PublicColumns(),
desc.PublicColumns(),
desc,
tree.NewUnqualifiedTableName(tree.Name(desc.GetName())),
evalCtx,
&semaCtx,
)
if err != nil {
return nil, err
}
for i, col := range desc.PublicColumns() {
cannotDistribute[i] = col.IsVirtual() && checkExpr(exprs[i]) != nil
}
}

// sortAndTrackStatsExists adds the given column IDs as a set to the
// requestedStats set. If the columnIDs were already in the set, it returns
// true. As a side-effect sortAndTrackStatsExists also sorts colIDs. NOTE:
@@ -385,11 +426,8 @@ func createStatsDefaultColumns(
return err
}

// Do not collect stats for virtual computed columns. DistSQLPlanner
// cannot currently collect stats for these columns because it plans
// table readers on the table's primary index which does not include
// virtual computed columns.
if col.IsVirtual() {
// Do not collect stats for virtual computed columns.
if col.IsVirtual() && (!virtColEnabled || cannotDistribute[col.Ordinal()]) {
return nil
}

@@ -471,7 +509,7 @@ func createStatsDefaultColumns(
if err != nil {
return nil, err
}
if col.IsVirtual() {
if col.IsVirtual() && (!virtColEnabled || cannotDistribute[col.Ordinal()]) {
continue
}
colIDs = append(colIDs, col.GetID())
@@ -528,7 +566,7 @@ func createStatsDefaultColumns(
col := desc.PublicColumns()[i]

// Do not collect stats for virtual computed columns.
if col.IsVirtual() {
if col.IsVirtual() && (!virtColEnabled || cannotDistribute[col.Ordinal()]) {
continue
}

137 changes: 130 additions & 7 deletions pkg/sql/distsql_plan_stats.go
@@ -22,9 +22,12 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/schemaexpr"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/tabledesc"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
"github.com/cockroachdb/cockroach/pkg/sql/opt/exec"
"github.com/cockroachdb/cockroach/pkg/sql/parser"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
@@ -368,14 +371,59 @@ func (dsp *DistSQLPlanner) createStatsPlan(
return nil, errors.New("no stats requested")
}

// Calculate the set of columns we need to scan.
// Calculate the set of columns we need to scan and any virtual computed cols.
var colCfg scanColumnsConfig
var tableColSet catalog.TableColSet
var requestedCols []catalog.Column
var virtComputedCols []catalog.Column
for _, s := range reqStats {
for _, c := range s.columns {
if !tableColSet.Contains(c) {
tableColSet.Add(c)
colCfg.wantedColumns = append(colCfg.wantedColumns, c)
col, err := catalog.MustFindColumnByID(desc, c)
if err != nil {
return nil, err
}
requestedCols = append(requestedCols, col)
if col.IsVirtual() {
virtComputedCols = append(virtComputedCols, col)
} else {
colCfg.wantedColumns = append(colCfg.wantedColumns, c)
}
}
}
}

// Add columns to the scan that are referenced by virtual computed column
// expressions but were not in the requested statistics.
if len(virtComputedCols) != 0 {
exprStrings := make([]string, 0, len(virtComputedCols))
for _, col := range virtComputedCols {
exprStrings = append(exprStrings, col.GetComputeExpr())
}

virtComputedExprs, err := parser.ParseExprs(exprStrings)
if err != nil {
return nil, err
}

for _, expr := range virtComputedExprs {
refColIDs, err := schemaexpr.ExtractColumnIDs(desc, expr)
if err != nil {
return nil, err
}
refColIDs.ForEach(func(c descpb.ColumnID) {
if !tableColSet.Contains(c) {
if _, err = catalog.MustFindColumnByID(desc, c); err != nil {
return
}
tableColSet.Add(c)
// Add the referenced column to the scan.
colCfg.wantedColumns = append(colCfg.wantedColumns, c)
}
})
if err != nil {
return nil, err
}
}
}
@@ -386,10 +434,6 @@ func (dsp *DistSQLPlanner) createStatsPlan(
if err != nil {
return nil, err
}
var colIdxMap catalog.TableColMap
for i, c := range scan.cols {
colIdxMap.Set(c.GetID(), i)
}
var sb span.Builder
sb.Init(planCtx.EvalContext(), planCtx.ExtendedEvalCtx.Codec, desc, scan.index)
scan.spans, err = sb.UnconstrainedSpans()
@@ -412,8 +456,87 @@
}
}

// Add rendering of virtual computed columns.
if len(virtComputedCols) != 0 {
// Resolve names and types.
semaCtx := tree.MakeSemaContext()
virtComputedExprs, _, err := schemaexpr.MakeComputedExprs(
ctx,
virtComputedCols,
scan.cols,
desc,
tree.NewUnqualifiedTableName(tree.Name(desc.GetName())),
planCtx.EvalContext(),
&semaCtx,
)
if err != nil {
return nil, err
}

// Build render expressions for all requested columns.
exprs := make(tree.TypedExprs, len(requestedCols))
resultCols := colinfo.ResultColumnsFromColumns(desc.GetID(), requestedCols)

ivh := tree.MakeIndexedVarHelper(nil /* container */, len(scan.cols))
var scanIdx, virtIdx int
for i, col := range requestedCols {
if col.IsVirtual() {
if virtIdx >= len(virtComputedExprs) {
return nil, errors.AssertionFailedf(
"virtual computed column expressions do not match requested columns: %v vs %v",
virtComputedExprs, requestedCols,
)
}
// Check that the virtual computed column expression can be distributed.
if err := checkExpr(virtComputedExprs[virtIdx]); err != nil {
return nil, err
}
exprs[i] = virtComputedExprs[virtIdx]
virtIdx++
} else {
// Confirm that the scan columns contain the requested column in the
// expected order.
if scanIdx >= len(scan.cols) || scan.cols[scanIdx].GetID() != col.GetID() {
return nil, errors.AssertionFailedf(
"scan columns do not match requested columns: %v vs %v", scan.cols, requestedCols,
)
}
exprs[i] = ivh.IndexedVarWithType(scanIdx, scan.cols[scanIdx].GetType())
scanIdx++
}
}

var rb renderBuilder
rb.init(exec.Node(planNode(&scan)), exec.OutputOrdering{})
for i, expr := range exprs {
exprs[i] = rb.r.ivarHelper.Rebind(expr)
}
rb.setOutput(exprs, resultCols)

err = dsp.createPlanForRender(ctx, p, rb.r, planCtx)
if err != nil {
return nil, err
}
} else {
// No virtual computed columns. Confirm that the scan columns match the
// requested columns.
for i, col := range requestedCols {
if i >= len(scan.cols) || scan.cols[i].GetID() != col.GetID() {
return nil, errors.AssertionFailedf(
"scan columns do not match requested columns: %v vs %v", scan.cols, requestedCols,
)
}
}
}

// Output of the scan or render will be in requestedCols order.
var colIdxMap catalog.TableColMap
for i, col := range requestedCols {
colIdxMap.Set(col.GetID(), i)
}

var sketchSpecs, invSketchSpecs []execinfrapb.SketchSpec
sampledColumnIDs := make([]descpb.ColumnID, len(scan.cols))
sampledColumnIDs := make([]descpb.ColumnID, len(requestedCols))
for _, s := range reqStats {
spec := execinfrapb.SketchSpec{
SketchType: execinfrapb.SketchType_HLL_PLUS_PLUS_V1,
9 changes: 1 addition & 8 deletions pkg/sql/importer/import_job.go
@@ -1037,14 +1037,7 @@ func (r *importResumer) writeStubStatisticsForImportedTables(
distinctCount := uint64(float64(rowCount) * memo.UnknownDistinctCountRatio)
nullCount := uint64(float64(rowCount) * memo.UnknownNullCountRatio)
avgRowSize := uint64(memo.UnknownAvgRowSize)
// Because we don't yet have real distinct and null counts, only produce
// single-column stats to avoid the appearance of perfectly correlated
// columns.
multiColEnabled := false
defaultHistogramBuckets := stats.GetDefaultHistogramBuckets(execCfg.SV(), desc)
statistics, err := sql.StubTableStats(
desc, jobspb.ImportStatsName, multiColEnabled, defaultHistogramBuckets,
)
statistics, err := sql.StubTableStats(desc, jobspb.ImportStatsName)
if err == nil {
for _, statistic := range statistics {
statistic.RowCount = rowCount
