Skip to content

Commit

Permalink
sql,opt: add hint for STRAIGHT join
Browse files Browse the repository at this point in the history
Our current join hints (a INNER HASH JOIN b, a LEFT LOOKUP JOIN b, etc)
fixes both the join order and the join algorithm. This commit adds the
syntax and support for hinting the join order without hinting the
join algorithm. This will be useful for the (few) cases where the
optimizer processes the tables in a suboptimal order.

Resolves: #115308

Release note (sql change): It is now possible to hint to the optimizer
that it should plan a straight join by using the syntax `... INNER
STRAIGHT JOIN ...`. If the hint is provided, the optimizer will now
fix the join order as given in the query, even if it estimates
that a different plan using join reordering would have a lower cost.
  • Loading branch information
kevinmingtarja committed Jan 29, 2024
1 parent 0baf22a commit 365ea77
Show file tree
Hide file tree
Showing 13 changed files with 431 additions and 2 deletions.
3 changes: 3 additions & 0 deletions docs/generated/sql/bnf/stmt_block.bnf
Original file line number Diff line number Diff line change
Expand Up @@ -1430,6 +1430,7 @@ unreserved_keyword ::=
| 'STORE'
| 'STORED'
| 'STORING'
| 'STRAIGHT'
| 'STREAM'
| 'STRICT'
| 'SUBSCRIPTION'
Expand Down Expand Up @@ -4057,6 +4058,7 @@ bare_label_keywords ::=
| 'STORE'
| 'STORED'
| 'STORING'
| 'STRAIGHT'
| 'STREAM'
| 'STRICT'
| 'STRING'
Expand Down Expand Up @@ -4167,6 +4169,7 @@ opt_join_hint ::=
| 'MERGE'
| 'LOOKUP'
| 'INVERTED'
| 'STRAIGHT'
|

join_type ::=
Expand Down
239 changes: 239 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/straight_join
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
# LogicTest: local

statement ok
CREATE TABLE t1 (x INT, PRIMARY KEY (x))

statement ok
CREATE TABLE t2 (x INT, y INT, z INT, PRIMARY KEY (x), INDEX idx_y (y))

# Set up the statistics as if t1 is much smaller than t2.
statement ok
ALTER TABLE t1 INJECT STATISTICS '[
{
"columns": ["x"],
"created_at": "2018-01-01 1:00:00.00000+00:00",
"row_count": 100,
"distinct_count": 100
}
]'

statement ok
ALTER TABLE t2 INJECT STATISTICS '[
{
"columns": ["y", "z"],
"created_at": "2018-01-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]'

# --------------------------------------------------
# INNER JOIN
# --------------------------------------------------

# The best plan should be a lookup join into t2 (right).
query T
EXPLAIN (VERBOSE) SELECT * FROM t1 INNER JOIN t2 ON t1.x = t2.y
----
distribution: local
vectorized: true
·
• lookup join (inner)
│ columns: (x, x, y, z)
│ estimated row count: 990
│ table: t2@t2_pkey
│ equality: (x) = (x)
│ equality cols are key
└── • lookup join (inner)
│ columns: (x, x, y)
│ estimated row count: 990
│ table: t2@idx_y
│ equality: (x) = (y)
└── • scan
columns: (x)
estimated row count: 100 (100% of the table; stats collected <hidden> ago)
table: t1@t1_pkey
spans: FULL SCAN

# Should not change the plan, as the table on the right side of the join is still t2.
query T
EXPLAIN (VERBOSE) SELECT * FROM t1 INNER STRAIGHT JOIN t2 ON t1.x = t2.y
----
distribution: local
vectorized: true
·
• lookup join (inner)
│ columns: (x, x, y, z)
│ estimated row count: 990
│ table: t2@t2_pkey
│ equality: (x) = (x)
│ equality cols are key
└── • lookup join (inner)
│ columns: (x, x, y)
│ estimated row count: 990
│ table: t2@idx_y
│ equality: (x) = (y)
└── • scan
columns: (x)
estimated row count: 100 (100% of the table; stats collected <hidden> ago)
table: t1@t1_pkey
spans: FULL SCAN

# Now, the best plan (lookup join into t2) should no longer be picked as t1 is now on the right.
query T
EXPLAIN (VERBOSE) SELECT * FROM t2 INNER STRAIGHT JOIN t1 ON t1.x = t2.y
----
distribution: local
vectorized: true
·
• hash join (inner)
│ columns: (x, y, z, x)
│ estimated row count: 990
│ equality: (y) = (x)
│ right cols are key
├── • scan
│ columns: (x, y, z)
│ estimated row count: 10,000 (100% of the table; stats collected <hidden> ago)
│ table: t2@t2_pkey
│ spans: FULL SCAN
└── • scan
columns: (x)
estimated row count: 100 (100% of the table; stats collected <hidden> ago)
table: t1@t1_pkey
spans: FULL SCAN


# The best plan should be a hash join into t1 (smaller table).
query T
EXPLAIN (VERBOSE) SELECT * FROM t1 INNER JOIN t2 ON t1.x = t2.z
----
distribution: local
vectorized: true
·
• hash join (inner)
│ columns: (x, x, y, z)
│ estimated row count: 990
│ equality: (z) = (x)
│ right cols are key
├── • scan
│ columns: (x, y, z)
│ estimated row count: 10,000 (100% of the table; stats collected <hidden> ago)
│ table: t2@t2_pkey
│ spans: FULL SCAN
└── • scan
columns: (x)
estimated row count: 100 (100% of the table; stats collected <hidden> ago)
table: t1@t1_pkey
spans: FULL SCAN

# Should not change the plan, as the table on the right side of the join is t1.
query T
EXPLAIN (VERBOSE) SELECT * FROM t2 INNER STRAIGHT JOIN t1 ON t1.x = t2.z
----
distribution: local
vectorized: true
·
• hash join (inner)
│ columns: (x, y, z, x)
│ estimated row count: 990
│ equality: (z) = (x)
│ right cols are key
├── • scan
│ columns: (x, y, z)
│ estimated row count: 10,000 (100% of the table; stats collected <hidden> ago)
│ table: t2@t2_pkey
│ spans: FULL SCAN
└── • scan
columns: (x)
estimated row count: 100 (100% of the table; stats collected <hidden> ago)
table: t1@t1_pkey
spans: FULL SCAN

# Now, the best plan (hash join into t1) should no longer be picked as t2 is now on the right.
query T
EXPLAIN (VERBOSE) SELECT * FROM t1 INNER STRAIGHT JOIN t2 ON t1.x = t2.z
----
distribution: local
vectorized: true
·
• hash join (inner)
│ columns: (x, x, y, z)
│ estimated row count: 990
│ equality: (x) = (z)
│ left cols are key
├── • scan
│ columns: (x)
│ estimated row count: 100 (100% of the table; stats collected <hidden> ago)
│ table: t1@t1_pkey
│ spans: FULL SCAN
└── • scan
columns: (x, y, z)
estimated row count: 10,000 (100% of the table; stats collected <hidden> ago)
table: t2@t2_pkey
spans: FULL SCAN

# --------------------------------------------------
# LEFT JOIN
# --------------------------------------------------

# The best plan should be a (commuted) right outer hash join into t1.
query T
EXPLAIN (VERBOSE) SELECT * FROM t1 LEFT JOIN t2 ON t1.x = t2.z
----
distribution: local
vectorized: true
·
• hash join (right outer)
│ columns: (x, x, y, z)
│ estimated row count: 1,000
│ equality: (z) = (x)
│ right cols are key
├── • scan
│ columns: (x, y, z)
│ estimated row count: 10,000 (100% of the table; stats collected <hidden> ago)
│ table: t2@t2_pkey
│ spans: FULL SCAN
└── • scan
columns: (x)
estimated row count: 100 (100% of the table; stats collected <hidden> ago)
table: t1@t1_pkey
spans: FULL SCAN

# Now, the best plan should no longer be picked, as we're forcing the join order.
query T
EXPLAIN (VERBOSE) SELECT * FROM t1 LEFT STRAIGHT JOIN t2 ON t1.x = t2.z
----
distribution: local
vectorized: true
·
• hash join (left outer)
│ columns: (x, x, y, z)
│ estimated row count: 1,000
│ equality: (x) = (z)
│ left cols are key
├── • scan
│ columns: (x)
│ estimated row count: 100 (100% of the table; stats collected <hidden> ago)
│ table: t1@t1_pkey
│ spans: FULL SCAN
└── • scan
columns: (x, y, z)
estimated row count: 10,000 (100% of the table; stats collected <hidden> ago)
table: t2@t2_pkey
spans: FULL SCAN
7 changes: 7 additions & 0 deletions pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pkg/sql/opt/memo/expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,11 @@ const (

// AllowOnlyMergeJoin has all "disallow" flags set except DisallowMergeJoin.
AllowOnlyMergeJoin = disallowAll ^ DisallowMergeJoin

// AllowAllJoinsIntoRight has all "disallow" flags set except
// DisallowHashJoinStoreRight, DisallowLookupJoinIntoRight,
// DisallowInvertedJoinIntoRight, and DisallowMergeJoin.
AllowAllJoinsIntoRight = disallowAll ^ DisallowHashJoinStoreRight ^ DisallowLookupJoinIntoRight ^ DisallowInvertedJoinIntoRight ^ DisallowMergeJoin
)

var joinFlagStr = map[JoinFlags]string{
Expand Down
4 changes: 4 additions & 0 deletions pkg/sql/opt/optbuilder/join.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ func (b *Builder) buildJoin(
telemetry.Inc(sqltelemetry.MergeJoinHintUseCounter)
flags = memo.AllowOnlyMergeJoin

case tree.AstStraight:
telemetry.Inc(sqltelemetry.StraightJoinHintUseCounter)
flags = memo.AllowAllJoinsIntoRight

default:
panic(pgerror.Newf(
pgcode.FeatureNotSupported, "join hint %s not supported", join.Hint,
Expand Down
15 changes: 15 additions & 0 deletions pkg/sql/opt/optbuilder/testdata/join
Original file line number Diff line number Diff line change
Expand Up @@ -2436,6 +2436,21 @@ project
└── filters
└── x:1 = y:5

build
SELECT * FROM onecolumn AS a(x) INNER STRAIGHT JOIN onecolumn AS b(y) ON a.x = b.y
----
project
├── columns: x:1!null y:5!null
└── inner-join (hash)
├── columns: x:1!null a.rowid:2!null a.crdb_internal_mvcc_timestamp:3 a.tableoid:4 y:5!null b.rowid:6!null b.crdb_internal_mvcc_timestamp:7 b.tableoid:8
├── flags: disallow hash join (store left side) and lookup join (into left side) and inverted join (into left side)
├── scan onecolumn [as=a]
│ └── columns: x:1 a.rowid:2!null a.crdb_internal_mvcc_timestamp:3 a.tableoid:4
├── scan onecolumn [as=b]
│ └── columns: y:5 b.rowid:6!null b.crdb_internal_mvcc_timestamp:7 b.tableoid:8
└── filters
└── x:1 = y:5

build
SELECT * FROM onecolumn AS a NATURAL LEFT LOOKUP JOIN onecolumn as b USING(x)
----
Expand Down
69 changes: 69 additions & 0 deletions pkg/sql/opt/xform/testdata/coster/join
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,75 @@ inner-join (hash)
└── filters
└── k:1 = x:7 [outer=(1,7), constraints=(/1: (/NULL - ]; /7: (/NULL - ]), fd=(1)==(7), (7)==(1)]

# Verify that if we specify a straight join, huge cost is given to
# commuted joins so that non-commuted joins are picked as
# the best plan by the optimizer.
exploretrace rule=CommuteLeftJoin format=(hide-all,show-cost)
SELECT k, x FROM b LEFT STRAIGHT JOIN a ON k=x
----
----
================================================================================
CommuteLeftJoin
================================================================================
Source expression:
left-join (hash)
├── flags: disallow hash join (store left side) and lookup join (into left side) and inverted join (into left side)
├── cost: 2187.10625
├── scan b
│ └── cost: 1068.42
├── scan a
│ └── cost: 1078.52
└── filters
└── k = x

New expression 1 of 1:
right-join (hash)
├── flags: disallow hash join (store right side) and lookup join (into right side) and inverted join (into right side)
├── cost: 1e+100
├── scan a
│ └── cost: 1078.52
├── scan b
│ └── cost: 1068.42
└── filters
└── k = x
----
----

# CommuteRightJoin is a normalization rule and thus does not
# show up in the exploretrace output. But we can verify that
# huge cost is given to commuted joins from one of the
# source expressions (which was commuted).
exploretrace rule=CommuteLeftJoin format=(hide-all,show-cost)
SELECT k, x FROM b RIGHT STRAIGHT JOIN a ON k=x
----
----
================================================================================
CommuteLeftJoin
================================================================================
Source expression:
left-join (hash)
├── flags: disallow hash join (store right side) and lookup join (into right side) and inverted join (into right side)
├── cost: 1e+100
├── scan a
│ └── cost: 1078.52
├── scan b
│ └── cost: 1068.42
└── filters
└── k = x

New expression 1 of 1:
right-join (hash)
├── flags: disallow hash join (store left side) and lookup join (into left side) and inverted join (into left side)
├── cost: 2187.10625
├── scan b
│ └── cost: 1068.42
├── scan a
│ └── cost: 1078.52
└── filters
└── k = x
----
----

# Verify that we pick inverted join if we force it.
opt
SELECT * FROM g AS g1 INNER INVERTED JOIN g AS g2 ON ST_Contains(g1.geom, g2.geom)
Expand Down

0 comments on commit 365ea77

Please sign in to comment.