New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
sql/{plan,analyzer}: Cache subquery results when joining against a subquery. #322
Changes from all commits
fe6585f
e164b8a
6a1c943
8253f57
29dcfc9
b5bfb19
92a2261
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -122,6 +122,23 @@ func nodeIsCacheable(n sql.Node, lowestAllowedIdx int) bool { | |
return cacheable | ||
} | ||
|
||
func isDeterminstic(n sql.Node) bool { | ||
res := true | ||
plan.InspectExpressions(n, func(e sql.Expression) bool { | ||
if s, ok := e.(*plan.Subquery); ok { | ||
if !isDeterminstic(s.Query) { | ||
res = false | ||
} | ||
return false | ||
} else if nd, ok := e.(sql.NonDeterministicExpression); ok && nd.IsNonDeterministic() { | ||
res = false | ||
return false | ||
} | ||
return true | ||
}) | ||
return res | ||
} | ||
|
||
// cacheSubqueryResults determines whether it's safe to cache the results for any subquery expressions, and marks the | ||
// subquery as cacheable if so. Caching subquery results is safe in the case that no outer scope columns are referenced, | ||
// and if all expressions in the subquery are deterministic. | ||
|
@@ -142,3 +159,49 @@ func cacheSubqueryResults(ctx *sql.Context, a *Analyzer, n sql.Node, scope *Scop | |
return s, nil | ||
}) | ||
} | ||
|
||
// cacheSubqueryAlisesInJoins will look for joins against subquery aliases that | ||
// will repeatedly execute the subquery, and will insert a *plan.CachedResults | ||
// node on top of those nodes when it is safe to do so. | ||
func cacheSubqueryAlisesInJoins(ctx *sql.Context, a *Analyzer, n sql.Node, scope *Scope) (sql.Node, error) { | ||
n, err := plan.TransformUpWithParent(n, func(child, parent sql.Node, childNum int) (sql.Node, error) { | ||
_, isJoin := parent.(plan.JoinNode) | ||
_, isIndexedJoin := parent.(*plan.IndexedJoin) | ||
if isJoin || isIndexedJoin { | ||
sa, isSubqueryAlias := child.(*plan.SubqueryAlias) | ||
if isSubqueryAlias && isDeterminstic(sa.Child) { | ||
return plan.NewCachedResults(child), nil | ||
} | ||
} | ||
return child, nil | ||
}) | ||
if err != nil { | ||
return n, err | ||
} | ||
|
||
// If the most primary table in the top level join is a CachedResults, remove it. | ||
// We only want to do this if we're at the top of the tree. | ||
// TODO: Not a perfect indicator of whether we're at the top of the tree... | ||
if scope == nil { | ||
selector := func(parent sql.Node, child sql.Node, childNum int) bool { | ||
if _, isIndexedJoin := parent.(*plan.IndexedJoin); isIndexedJoin { | ||
return childNum == 0 | ||
} else if j, isJoin := parent.(plan.JoinNode); isJoin { | ||
if j.JoinType() == plan.JoinTypeRight { | ||
return childNum == 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is wrong -- indexed joins always put the primary table as the first table, even if it's a right join |
||
} else { | ||
return childNum == 0 | ||
} | ||
} | ||
return true | ||
} | ||
n, err = plan.TransformUpWithSelector(n, selector, func(n sql.Node) (sql.Node, error) { | ||
cr, isCR := n.(*plan.CachedResults) | ||
if isCR { | ||
return cr.UnaryNode.Child, nil | ||
} | ||
return n, nil | ||
}) | ||
} | ||
return n, err | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
// Copyright 2021 Dolthub, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package plan | ||
|
||
import ( | ||
"io" | ||
"sync" | ||
|
||
"github.com/dolthub/go-mysql-server/sql" | ||
) | ||
|
||
// NewCachedResults returns a cached results plan Node, which will use a | ||
// RowCache to cache results generated by Child.RowIter() and return those | ||
// results for future calls to RowIter. This node is only safe to use if the | ||
// Child is determinstic and is not dependent on the |row| parameter in the | ||
// call to RowIter. | ||
func NewCachedResults(n sql.Node) *CachedResults { | ||
return &CachedResults{UnaryNode: UnaryNode{n}} | ||
} | ||
|
||
type CachedResults struct { | ||
UnaryNode | ||
cache sql.RowsCache | ||
dispose sql.DisposeFunc | ||
mutex sync.Mutex | ||
noCache bool | ||
} | ||
|
||
func (n *CachedResults) RowIter(ctx *sql.Context, r sql.Row) (sql.RowIter, error) { | ||
n.mutex.Lock() | ||
defer n.mutex.Unlock() | ||
if n.cache != nil { | ||
return sql.RowsToRowIter(n.cache.Get()...), nil | ||
} else if n.noCache { | ||
return n.UnaryNode.Child.RowIter(ctx, r) | ||
} | ||
ci, err := n.UnaryNode.Child.RowIter(ctx, r) | ||
if err != nil { | ||
return nil, err | ||
} | ||
cache, dispose := ctx.Memory.NewRowsCache() | ||
return &cachedResultsIter{n, ci, cache, dispose}, nil | ||
} | ||
|
||
func (n *CachedResults) Dispose() { | ||
if n.dispose != nil { | ||
n.dispose() | ||
} | ||
} | ||
|
||
func (n *CachedResults) String() string { | ||
pr := sql.NewTreePrinter() | ||
_ = pr.WriteNode("CachedResults") | ||
_ = pr.WriteChildren(n.UnaryNode.Child.String()) | ||
return pr.String() | ||
} | ||
|
||
func (n *CachedResults) DebugString() string { | ||
pr := sql.NewTreePrinter() | ||
_ = pr.WriteNode("CachedResults") | ||
_ = pr.WriteChildren(sql.DebugString(n.UnaryNode.Child)) | ||
return pr.String() | ||
} | ||
|
||
func (n *CachedResults) WithChildren(children ...sql.Node) (sql.Node, error) { | ||
if len(children) != 1 { | ||
return nil, sql.ErrInvalidChildrenNumber.New(n, len(children), 1) | ||
} | ||
nn := *n | ||
nn.UnaryNode.Child = children[0] | ||
return &nn, nil | ||
} | ||
|
||
|
||
|
||
type cachedResultsIter struct { | ||
parent *CachedResults | ||
iter sql.RowIter | ||
cache sql.RowsCache | ||
dispose sql.DisposeFunc | ||
} | ||
|
||
func (i *cachedResultsIter) Next() (sql.Row, error) { | ||
r, err := i.iter.Next() | ||
if i.cache != nil { | ||
if err != nil { | ||
if err == io.EOF { | ||
i.parent.mutex.Lock() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would be a little easier to read with fillCacheInParent method |
||
defer i.parent.mutex.Unlock() | ||
i.setCacheInParent() | ||
} else { | ||
i.cleanUp() | ||
} | ||
} else { | ||
aerr := i.cache.Add(r) | ||
if aerr != nil { | ||
i.cleanUp() | ||
i.parent.mutex.Lock() | ||
defer i.parent.mutex.Unlock() | ||
i.parent.noCache = true | ||
} | ||
} | ||
} | ||
return r, err | ||
} | ||
|
||
func (i *cachedResultsIter) setCacheInParent() { | ||
if i.parent.cache == nil { | ||
i.parent.cache = i.cache | ||
i.parent.dispose = i.dispose | ||
i.cache = nil | ||
i.dispose = nil | ||
} else { | ||
i.cleanUp() | ||
} | ||
} | ||
|
||
func (i *cachedResultsIter) cleanUp() { | ||
if i.dispose != nil { | ||
i.dispose() | ||
i.cache = nil | ||
i.dispose = nil | ||
} | ||
} | ||
|
||
func (i *cachedResultsIter) Close(ctx *sql.Context) error { | ||
i.cleanUp() | ||
return i.iter.Close(ctx) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is going to waste memory unnecessarily if the subquery is the first table in the join and therefore unnecessary to cache. Maybe add a TODO to fix? (algorithm to determine this case doesn't map easily to one of the transform up methods, i don't think)