Skip to content

Commit

Permalink
rowcontainer: prevent using key encoding for tuples
Browse files Browse the repository at this point in the history
This commit fixes a bug that could produce incorrect results or decoding
errors when we spilled tuples to disk in the row-by-row engine and used
the key encoding for it. The problem is that the tuple key encoding is
currently faulty, for example, it could make `encoding.PeekLength`
behave incorrectly which later would corrupt the datum for the next
column. We now avoid this problematic behavior whenever we're spilling
to disk and we need to sort by the tuple type. As a concrete example
from the reproduction test:
- we have tuple `(NULL, NULL, NULL)` followed by an interval datum
- when encoding it we get `[0, 0, 0, 22, ...]` since
each NULL within the tuple gets the NULL marker
- when decoding the spilled key-value pair, we see that the first
encoded value has the NULL marker, so we only skip 1 byte
- this leaves `[0, 0, 22, ...]` to be decoded as an interval resulting
in a datum corruption.

The faulty key encoding has existed since the dawn of time (added in
df53f5b). The good news is that we
cannot store tuples in the tables, so fixing the encoding would only
affect the in-memory representation. The bad news is the tuple
comparison rules which AFAICT would prohibit us from adding the number
of elements in the tuple or the total encoding length in the encoding
prefix.

The bug only affects the row-by-row engine when spilling to disk, which is
a non-default configuration, so the release note is omitted. (The
vectorized engine is unaffected since it doesn't use the row containers
and uses value encoding for non-native datums when spilling to disk.)

Release note: None
  • Loading branch information
yuzefovich committed Jun 12, 2024
1 parent 41c37cb commit cf339f7
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 4 deletions.
8 changes: 8 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/suboperators
Original file line number Diff line number Diff line change
Expand Up @@ -495,8 +495,16 @@ SELECT a, b FROM abc WHERE (a, b) = ANY(ARRAY(SELECT ROW(a, b) FROM abc LIMIT 1)
----
1 10

# The next query might error out when spilling to disk in the row-by-row engine,
# so we only use vectorized engine for it.
statement ok
SET vectorize = on;

query T rowsort
SELECT * FROM comp WHERE v IN (SELECT v FROM comp);
----
(1,2)
(3,4)

statement ok
RESET vectorize;
24 changes: 24 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/tuple
Original file line number Diff line number Diff line change
Expand Up @@ -1283,3 +1283,27 @@ NULL
(,,1100)
(,1.1,)
(1,,)

# Regression test for incorrectly spilling tuples to disk which produces
# "encoding corruption" of other columns (#125367).
statement ok
CREATE TABLE t125367 AS SELECT
g AS _int8, g * '1 day'::INTERVAL AS _interval, g::DECIMAL AS _decimal
FROM generate_series(1, 5) AS g;
UPDATE t125367 SET _interval = '7 years 1 mon 887 days 18:22:39.99567';
SET testing_optimizer_random_seed = 1481092000980190599;
SET testing_optimizer_disable_rule_probability = 1.000000;
SET vectorize = off;
SET distsql_workmem = '2B';

statement error pgcode 0A000 unimplemented: can't spill column type RECORD to disk
SELECT tab_1._interval AS col_1, (NULL:::STRING, NULL:::JSONB, NULL:::TIME) AS col_2
FROM t125367 AS tab_2 JOIN t125367 AS tab_1 ON (tab_2._int8) = (tab_1._int8)
ORDER BY col_2, tab_2._interval, tab_2._decimal, col_1 DESC
LIMIT 1000;

statement ok
RESET testing_optimizer_random_seed;
RESET testing_optimizer_disable_rule_probability;
RESET vectorize;
RESET distsql_workmem;
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/void
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ ORDER BY

# Regression test for #83754.
# Non-vectorized and vectorized results should match.
skipif config fakedist-disk
query T
SELECT
COALESCE(tab_115318.col_199168, NULL) AS col_199169
Expand Down
16 changes: 12 additions & 4 deletions pkg/sql/rowcontainer/disk_row_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ func MakeDiskRowContainer(
typs []*types.T,
ordering colinfo.ColumnOrdering,
e diskmap.Factory,
) (DiskRowContainer, error) {
) (_ DiskRowContainer, retErr error) {
diskMap := e.NewSortedDiskMap()
d := DiskRowContainer{
diskMap: diskMap,
Expand All @@ -114,6 +114,13 @@ func MakeDiskRowContainer(
engine: e,
datumAlloc: &tree.DatumAlloc{},
}
defer func() {
if retErr != nil {
// Ensure to close the container since we're not returning it to the
// caller.
d.Close(ctx)
}
}()

// The ordering is specified for a subset of the columns. These will be
// encoded as a key in the given order according to the given direction so
Expand Down Expand Up @@ -144,12 +151,13 @@ func MakeDiskRowContainer(
d.encodings[i] = rowenc.EncodingDirToDatumEncoding(orderInfo.Direction)
switch t := typs[orderInfo.ColIdx]; t.Family() {
case types.TSQueryFamily, types.TSVectorFamily:
// Ensure to close the container since we're not returning it to the
// caller.
d.Close(ctx)
return DiskRowContainer{}, unimplemented.NewWithIssueDetailf(
92165, "", "can't order by column type %s", t.SQLStringForError(),
)
case types.TupleFamily:
return DiskRowContainer{}, unimplemented.NewWithIssueDetailf(
49975, "", "can't spill column type %s to disk", t.SQLStringForError(),
)
}
}

Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/rowenc/keyside/encode.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ func Encode(b []byte, val tree.Datum, dir encoding.Direction) ([]byte, error) {
}
return encoding.EncodeBytesDescending(b, data), nil
case *tree.DTuple:
// TODO(49975): due to the fact that we're not adding any "tuple
// marker", this encoding is faulty since it can lead to incorrect
// decoding: e.g. tuple (NULL, NULL) is encoded as [0, 0], but when
// decoding it, encoding.PeekLength will return 1 leaving the second
// zero in the buffer which could later result in corruption of the
// datum for the next column.
for _, datum := range t.D {
var err error
b, err = Encode(b, datum, dir)
Expand Down

0 comments on commit cf339f7

Please sign in to comment.