From cf339f7966929cd83682de747f8e7a6ad7f82c25 Mon Sep 17 00:00:00 2001 From: Yahor Yuzefovich Date: Mon, 10 Jun 2024 21:08:02 -0700 Subject: [PATCH] rowcontainer: prevent using key encoding for tuples This commit fixes a bug that could produce incorrect results or decoding errors when we spilled tuples to disk in the row-by-row engine and used the key encoding for it. The problem is that the tuple key encoding is currently faulty, for example, it could make `encoding.PeekLength` behave incorrectly which later would corrupt the datum for the next column. We now avoid this problematic behavior whenever we're spilling to disk and we need to sort by the tuple type. As a concrete example from the reproduction test: - we have tuple `(NULL, NULL, NULL)` followed by an interval datum - when encoding it we get `[0, 0, 0, 22, ...]` since each NULL within the tuple gets the NULL marker - when decoding the spilled key-value pair, we see that the first encoded value has the NULL marker, so we only skip 1 byte - this leaves `[0, 0, 22, ...]` to be decoded as an interval resulting in a datum corruption. The faulty key encoding has existed since the dawn of time (added in df53f5b6459095a6d39234e6a9d9c3bfd3ed36b0). The good news is that we cannot store tuples in the tables, so fixing the encoding would only affect the in-memory representation. The bad news is the tuple comparison rules which AFAICT would prohibit us from adding the number of elements in the tuple or the total encoding length in the encoding prefix. The bug only affects the row-by-row engine when spilling to disk, which is a non-default configuration, so the release note is omitted. (The vectorized engine is unaffected since it doesn't use the row containers and uses values encoding for non-native datums when spilling to disk.) 
Release note: None --- .../testdata/logic_test/suboperators | 8 +++++++ pkg/sql/logictest/testdata/logic_test/tuple | 24 +++++++++++++++++++ pkg/sql/logictest/testdata/logic_test/void | 1 + pkg/sql/rowcontainer/disk_row_container.go | 16 +++++++++---- pkg/sql/rowenc/keyside/encode.go | 6 +++++ 5 files changed, 51 insertions(+), 4 deletions(-) diff --git a/pkg/sql/logictest/testdata/logic_test/suboperators b/pkg/sql/logictest/testdata/logic_test/suboperators index 1f1495e81d01..ef5385766955 100644 --- a/pkg/sql/logictest/testdata/logic_test/suboperators +++ b/pkg/sql/logictest/testdata/logic_test/suboperators @@ -495,8 +495,16 @@ SELECT a, b FROM abc WHERE (a, b) = ANY(ARRAY(SELECT ROW(a, b) FROM abc LIMIT 1) ---- 1 10 +# The next query might error out when spilling to disk in the row-by-row engine, +# so we only use vectorized engine for it. +statement ok +SET vectorize = on; + query T rowsort SELECT * FROM comp WHERE v IN (SELECT v FROM comp); ---- (1,2) (3,4) + +statement ok +RESET vectorize; diff --git a/pkg/sql/logictest/testdata/logic_test/tuple b/pkg/sql/logictest/testdata/logic_test/tuple index 317236a44a37..30748cea7434 100644 --- a/pkg/sql/logictest/testdata/logic_test/tuple +++ b/pkg/sql/logictest/testdata/logic_test/tuple @@ -1283,3 +1283,27 @@ NULL (,,1100) (,1.1,) (1,,) + +# Regression test for incorrectly spilling tuples to disk which produces +# "encoding corruption" of other columns (#125367). 
+statement ok +CREATE TABLE t125367 AS SELECT + g AS _int8, g * '1 day'::INTERVAL AS _interval, g::DECIMAL AS _decimal + FROM generate_series(1, 5) AS g; +UPDATE t125367 SET _interval = '7 years 1 mon 887 days 18:22:39.99567'; +SET testing_optimizer_random_seed = 1481092000980190599; +SET testing_optimizer_disable_rule_probability = 1.000000; +SET vectorize = off; +SET distsql_workmem = '2B'; + +statement error pgcode 0A000 unimplemented: can't spill column type RECORD to disk +SELECT tab_1._interval AS col_1, (NULL:::STRING, NULL:::JSONB, NULL:::TIME) AS col_2 + FROM t125367 AS tab_2 JOIN t125367 AS tab_1 ON (tab_2._int8) = (tab_1._int8) + ORDER BY col_2, tab_2._interval, tab_2._decimal, col_1 DESC + LIMIT 1000; + +statement ok +RESET testing_optimizer_random_seed; +RESET testing_optimizer_disable_rule_probability; +RESET vectorize; +RESET distsql_workmem; diff --git a/pkg/sql/logictest/testdata/logic_test/void b/pkg/sql/logictest/testdata/logic_test/void index 0dbd6a619071..a245e02d6de9 100644 --- a/pkg/sql/logictest/testdata/logic_test/void +++ b/pkg/sql/logictest/testdata/logic_test/void @@ -139,6 +139,7 @@ ORDER BY # Regression test for #83754. # Non-vectorized and vectorized results should match. 
+skipif config fakedist-disk query T SELECT COALESCE(tab_115318.col_199168, NULL) AS col_199169 diff --git a/pkg/sql/rowcontainer/disk_row_container.go b/pkg/sql/rowcontainer/disk_row_container.go index c2e36baeb0fe..b214a0e9cabf 100644 --- a/pkg/sql/rowcontainer/disk_row_container.go +++ b/pkg/sql/rowcontainer/disk_row_container.go @@ -102,7 +102,7 @@ func MakeDiskRowContainer( typs []*types.T, ordering colinfo.ColumnOrdering, e diskmap.Factory, -) (DiskRowContainer, error) { +) (_ DiskRowContainer, retErr error) { diskMap := e.NewSortedDiskMap() d := DiskRowContainer{ diskMap: diskMap, @@ -114,6 +114,13 @@ func MakeDiskRowContainer( engine: e, datumAlloc: &tree.DatumAlloc{}, } + defer func() { + if retErr != nil { + // Ensure to close the container since we're not returning it to the + // caller. + d.Close(ctx) + } + }() // The ordering is specified for a subset of the columns. These will be // encoded as a key in the given order according to the given direction so @@ -144,12 +151,13 @@ func MakeDiskRowContainer( d.encodings[i] = rowenc.EncodingDirToDatumEncoding(orderInfo.Direction) switch t := typs[orderInfo.ColIdx]; t.Family() { case types.TSQueryFamily, types.TSVectorFamily: - // Ensure to close the container since we're not returning it to the - // caller. 
- d.Close(ctx) return DiskRowContainer{}, unimplemented.NewWithIssueDetailf( 92165, "", "can't order by column type %s", t.SQLStringForError(), ) + case types.TupleFamily: + return DiskRowContainer{}, unimplemented.NewWithIssueDetailf( + 49975, "", "can't spill column type %s to disk", t.SQLStringForError(), + ) } } diff --git a/pkg/sql/rowenc/keyside/encode.go b/pkg/sql/rowenc/keyside/encode.go index e08c556982a9..7d1791559a04 100644 --- a/pkg/sql/rowenc/keyside/encode.go +++ b/pkg/sql/rowenc/keyside/encode.go @@ -145,6 +145,12 @@ func Encode(b []byte, val tree.Datum, dir encoding.Direction) ([]byte, error) { } return encoding.EncodeBytesDescending(b, data), nil case *tree.DTuple: + // TODO(49975): due to the fact that we're not adding any "tuple + // marker", this encoding is faulty since it can lead to incorrect + // decoding: e.g. tuple (NULL, NULL) is encoded as [0, 0], but when + // decoding it, encoding.PeekLength will return 1 leaving the second + // zero in the buffer which could later result in corruption of the + // datum for the next column. for _, datum := range t.D { var err error b, err = Encode(b, datum, dir)