Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

execinfra: admission control interface #97598

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/sql/execinfra/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ load("//pkg/testutils/buildutil:buildutil.bzl", "disallowed_imports_test")
go_library(
name = "execinfra",
srcs = [
"admission.go",
"base.go",
"flow_context.go",
"metrics.go",
Expand Down Expand Up @@ -60,6 +61,7 @@ go_library(
"//pkg/sql/types",
"//pkg/storage/fs",
"//pkg/util/admission",
"//pkg/util/admission/admissionpb",
"//pkg/util/buildutil",
"//pkg/util/intsets",
"//pkg/util/limit",
Expand Down
184 changes: 184 additions & 0 deletions pkg/sql/execinfra/admission.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package execinfra

import (
"context"
"time"

"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/admission"
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
)

type WorkInfo struct {
// WorkloadID is the id of the workload, where a workloads get a fair share
// of the underlying resource (CPU). This could be the TenantID, or some
// other finer-grained way of separating workloads. The admission control
// sub-system is equipped to do weighted fair-sharing, and if needed we can
// expose a way to set the weights.
//
// Don't use 0, since it is a reserved, invalid id.
WorkloadID uint64
// Priority is utilized within a workload. Lower priority work can be
// starved by higher priority work.
Priority admissionpb.WorkPriority
// CreateTime is equivalent to Time.UnixNano() at the creation time of this
// work or a parent work (e.g. could be the start time of the transaction,
// if this work was created as part of a transaction). It is used to order
// work within a (WorkloadID, Priority) pair -- earlier CreateTime is given
// preference.
CreateTime int64
}

type Handle struct {
workInfo WorkInfo
cpuTime time.Duration
firstAdmitCalled bool
callAdmittedWorkDone bool
}

// Controller implements an admission control interface for SQL code.
// SQL code is assumed to be primarily CPU bound.
//
// This is a strawman based on the discussion in
// https://github.com/cockroachdb/cockroach/issues/85471. It is geared towards
// doing a blocking admit at a coarse granularity, while providing more
// fine-grained CPU consumption information (for fairness and accounting
// purposes). It assumes that the instrumentation points for these two are
// same, so it hides the details on when to do a blocking admit. See comment
// in
// https://github.com/cockroachdb/cockroach/issues/85471#issuecomment-1440771823.
type Controller interface {
InitHandle(WorkInfo, *Handle)
// TryAdmit and AdmittedWorkDone must be paired to encompass the CPU done within SQL.
// - The work done within the (TryAdmit, AdmittedWorkDone) must not recursively call another
// TryAdmit. This can be accomplished by the buffering scheme described in
// https://github.com/cockroachdb/cockroach/issues/85471#issuecomment-1440771823 which does
// this bottom up in the operator tree.
// - The work done within the (TryAdmit, AdmittedWorkDone) must not call
// into KV. This means the aforementioned bottom up scheme needs to be
// tweaked such that the leaf operator that is calling into KV should not
// do (TryAdmit, AdmittedWorkDone) around it's Next call. This means we
// need to exclude such operators from the generic admissionOp wrapper --
// hopefully there are only a few of these so not hard to make that
// exception.
TryAdmit(context.Context, *Handle) error
AdmittedWorkDone(h *Handle, cpuTime time.Duration)
}

type noopControllerImpl struct{}

var _ Controller = &noopControllerImpl{}

func (c *noopControllerImpl) InitHandle(WorkInfo, *Handle) {}

func (c *noopControllerImpl) TryAdmit(context.Context, *Handle) error {
return nil
}

func (c *noopControllerImpl) AdmittedWorkDone(h *Handle, cpuTime time.Duration) {}

type controllerImpl struct {
// Same as the current KV admission queue, since used for both KV and SQL.
cpuAdmissionQ *admission.WorkQueue
}

func (c *controllerImpl) InitHandle(w WorkInfo, h *Handle) {
*h = Handle{
workInfo: w,
}
}

func (c *controllerImpl) TryAdmit(ctx context.Context, h *Handle) error {
if h.firstAdmitCalled && h.cpuTime < 5*time.Millisecond {
return nil
}
enabled, err := c.cpuAdmissionQ.Admit(ctx, admission.WorkInfo{
TenantID: roachpb.TenantID{InternalValue: h.workInfo.WorkloadID},
Priority: h.workInfo.Priority,
CreateTime: h.workInfo.CreateTime,
})
if err != nil {
return err
}
if enabled {
h.callAdmittedWorkDone = true
}
h.cpuTime = 0
if !h.firstAdmitCalled {
h.firstAdmitCalled = true
}
return nil
}

func (c *controllerImpl) AdmittedWorkDone(h *Handle, cpuTime time.Duration) {
if h.callAdmittedWorkDone {
c.cpuAdmissionQ.AdmittedWorkDone(roachpb.TenantID{InternalValue: h.workInfo.WorkloadID})
h.callAdmittedWorkDone = false
}
// TODO(sumeer): also plumb the cpuTime to the WorkQueue when we do
// https://github.com/cockroachdb/cockroach/issues/91533.
h.cpuTime += cpuTime
}

// Discussion of issues etc.:
// - After the fact accounting is good: it helps with fairness in future
// allocations and understanding which workloads are consuming CPU.
// - (TryAdmit, AdmittedWorkDone) pair should include the actual work. This is
// because slots control instantaneous concurrency of work, and not shaping
// to a rate.
// - If they don't include the actual work the instantaneous concurrency
// number is nonsensical. So probably what I said on
// https://github.com/cockroachdb/cockroach/issues/85471#issuecomment-1440687926
// about calling it "before sending a new ProducerMessage" does not make
// sense.
// - We are sampling in the (TryAdmit, AdmittedWorkDone) to reduce the
// overhead of interacting with AC. Let's calls the sampled ones (Admit,
// AdmittedWorkDone). How effective is AC with this sampling? Say we
// sampled after 10ms of accumulated cpu time, and each TryAdmit did 1ms
// of cpu time. So we are sampling 1 in 10 calls. Then the actual
// concurrency will be 10x of what is being observed by AC.
// - Deadlock when consuming same concurrency slot as kv (different from the
// deadlock discussed in
// https://github.com/cockroachdb/cockroach/issues/85471 until Feb 22). If
// some of the work in the (Admit, AdmittedWorkDone) interval involves
// calling into KV, we can deadlock if that KV work blocks for a concurrency
// slot. This is why I had originally been trying to do the Admit call on
// the response path (and forgot about that in the recent discussions).
//
// Possible solution:
// - What if we shape the rate of cpu consumption of SQL?
// - Harder to make it work conserving as discussed in
// https://docs.google.com/document/d/16RISoB3zOORf24k0fQPDJ4tTCisBVh-PkCZEit4iwV8/edit#heading=h.amw25se2ob5m
// and
// https://docs.google.com/document/d/16RISoB3zOORf24k0fQPDJ4tTCisBVh-PkCZEit4iwV8/edit#bookmark=id.479u56nbiz7
// - Harder to make it work along with KV slots. Would then want to convert
// KV slots to also use cpu tokens.
//
// - Only solve the deadlock issue identified above.
// - Ignore the 10x actual concurrency (in the example) due to sampling when
// calling (Admit, AdmittedWorkDone). We are setting
// admission.kv_slot_adjuster.overload_threshold=32 so we are already
// allowing a significant backlog in the goroutine scheduler. So we retain
// enough control via AC -- the kvSlotAdjuster will have see this
// burstiness and decrease the slots.
// - Solve the deadlock problem by not calling (Admit, AdmittedWorkDone) in
// the operator that will call into KV for the batch.

// Next steps (in sequence):
// - TODO(irfansharif): look at discussion on https://github.com/cockroachdb/cockroach/issues/85471
// and this code and check for sanity.
// - TODO(drewk): attempt to instrument SQL with this interface.
// - TODO(sumeer,irfansharif,drewk):
// - after the instrumentation do the plumbing to create the real
// controllerImpl and place it in a place that SQL can use.
// - run experiments with the new scheme.
88 changes: 88 additions & 0 deletions pkg/util/admission/cpu_admission.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package admission

import (
"context"
"time"
)

// CPUAdmission is an alternative interface for (non-elastic) CPU admission,
// that discards the slot based mechanism, in favor of tokens and only
// after-the-consumption accounting and blocking. Allows for KV, SQL-KV,
// SQL-SQL to be unified.
//
// Why?
//
// - Priority inversion discussed in
// https://github.com/cockroachdb/cockroach/issues/85471. Solutions for SQL
// that require actual work to be sandwiched between TryAdmit and
// AdmittedWorkDone are unlikely to be viable.
//
// - admission.kv_slot_adjuster.overload_threshold defaults to 32 and causes
// excessive goroutine scheduling latency. We have discussed in
// https://docs.google.com/document/d/16RISoB3zOORf24k0fQPDJ4tTCisBVh-PkCZEit4iwV8/edit#heading=h.amw25se2ob5m
// and
// https://docs.google.com/document/d/16RISoB3zOORf24k0fQPDJ4tTCisBVh-PkCZEit4iwV8/edit#bookmark=id.479u56nbiz7
// that a token based scheme is hard to make work-conserving, but those
// arguments were based on before-the-consumption blocking deduction (like
// we do for elastic work, where work conserving is not necessary) and
// after-the-consumption non-blocking correction. By switching to
// after-the-consumption blocking deduction, we can decide on a fixed burst
// budget as a function of the token generation rate and be ok. For example,
// consider the simple case of 1ms of cpu tokens per ms and a burst budget
// of 20ms. And say only 50% of the actual cpu work is instrumented to use
// AC. If the non-instrumented work consumes 100% of the cpu (because of
// excessive concurrency), the instrumented work over a 20ms interval can at
// most admit 20ms+20ms = 40ms so another 200%, for a total work trying to
// use CPU of 100+200=300%. This is still better than the 3200+100=3300%
// that the current scheme allows.
//
// NB: this will subsume the need for
// https://github.com/cockroachdb/cockroach/issues/91536. However, we can
// still suffer from under-admittance if there is a burst of work finishing,
// that blocked after consuming most of its cpu and then finishes after
// consuming a tiny amount of additional cpu -- we will count all the token
// deduction on completion even though most of the consumption happened a
// while ago. We could rectify this making Admit block if tokens <= 0 AND
// there were idle P's in the last sample of P state (sampled at 1ms
// intervals).
//
// Admit consumes only 1ns of tokens. Since 1ns is tiny, when available tokens
// become positive, we could admit an arbitrary sized burst. We already have a
// solution for this: grant-chaining, which will now be used for for KV work
// too.
type CPUAdmission interface {
Admit(ctx context.Context, work WorkInfo) (CPUAdmissionHandle, error)
}

type CPUAdmissionHandle interface {
// Used can block. Do not use inside KV layer, which may be holding latches.
// For KV wrap in KVWorkBoundingHandle.
Used(ctx context.Context, cpuTime time.Duration) error
}

type KVWorkBoundingHandle struct {
ah CPUAdmissionHandle
// TODO: similar to ElasticCPUWorkHandle in tracking
}

func NewKVWorkBoundingHandle(ah CPUAdmissionHandle) *KVWorkBoundingHandle {
// TODO:
}

func (h *KVWorkBoundingHandle) ShouldReturn() bool {
// TODO: returns true when consumed more than N ms of cpu time on this goroutine.
}

func (h *KVWorkBoundingHandle) Done() bool {
// TODO: calls ah.Used with the cpu consumed.
}