Skip to content

Commit

Permalink
fix(cubestore): 'unsorted data' assertion with high-precision timestamps
Browse files Browse the repository at this point in the history
CubeStore used to truncate timestamps to millisecond precision when
writing to parquet, but sort the data with nanosecond precision.

This led to 'unsorted data in merge' assertions.

Ensure we truncate before we sort the data.
Increasing the storage precision is another option, but that involves
backward and forward compatibility issues and requires more planning.
So stick with the current behavior for now.

If you see the 'unsorted data' assertion in the logs, you have to manually
drop the tables where this happens, e.g. by rebuilding the rollups in
CubeJS.
  • Loading branch information
ilya-biryukov committed Aug 25, 2021
1 parent 9c0d4fe commit 58a8cb4
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 1 deletion.
37 changes: 37 additions & 0 deletions rust/cubestore-sql-tests/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ pub fn sql_tests() -> Vec<(&'static str, TestFn)> {
t("now", now),
t("dump", dump),
t("unsorted_merge_assertion", unsorted_merge_assertion),
t("unsorted_data_timestamps", unsorted_data_timestamps),
];

fn t<F>(name: &'static str, f: fn(Box<dyn SqlClient>) -> F) -> (&'static str, TestFn)
Expand Down Expand Up @@ -3332,6 +3333,42 @@ async fn unsorted_merge_assertion(service: Box<dyn SqlClient>) {
assert_eq!(to_rows(&r), rows(&[(3, 2, 2), (2, 3, 2), (1, 4, 2)]));
}

/// Regression test: rows whose timestamps differ only in their nanosecond digits must be
/// readable, and mergeable, without hitting a sort-order assertion.
///
/// The three inserted timestamps differ only in the last nanosecond digit and are all expected
/// to read back as `2020-01-01T00:00:00Z`, i.e. with the sub-second part truncated away.
async fn unsorted_data_timestamps(service: Box<dyn SqlClient>) {
    let create_table = "CREATE TABLE s.data(t timestamp, n string)";
    // Deliberately inserted out of nanosecond order: 5ns, 1ns, 2ns.
    let insert_rows = "INSERT INTO s.data(t, n) VALUES \
                       ('2020-01-01T00:00:00.000000005Z', 'a'), \
                       ('2020-01-01T00:00:00.000000001Z', 'b'), \
                       ('2020-01-01T00:00:00.000000002Z', 'c')";
    let merged_select =
        "SELECT t, n FROM (SELECT * FROM s.data UNION ALL SELECT * FROM s.data) data \
         GROUP BY 1, 2 \
         ORDER BY 1, 2";

    service.exec_query("CREATE SCHEMA s").await.unwrap();
    service.exec_query(create_table).await.unwrap();
    service.exec_query(insert_rows).await.unwrap();

    // A plain scan without ORDER BY: the expected a, b, c order only holds if the rows were
    // sorted on insert using the already-truncated timestamps. This leans on CubeStore
    // implementation details.
    let plain = service.exec_query("SELECT t, n FROM s.data").await.unwrap();

    let truncated = timestamp_from_string("2020-01-01T00:00:00Z").unwrap();
    let expected = rows(&[(truncated, "a"), (truncated, "b"), (truncated, "c")]);
    assert_eq!(to_rows(&plain), expected);

    // The UNION ALL + GROUP BY query is meant to go through MergeSortExec; it must finish
    // without tripping an assertion and still yield the same three rows.
    let merged = service.exec_query(merged_select).await.unwrap();
    assert_eq!(to_rows(&merged), expected);
}

async fn now(service: Box<dyn SqlClient>) {
let r = service.exec_query("SELECT now()").await.unwrap();
assert_eq!(r.get_rows().len(), 1);
Expand Down
5 changes: 4 additions & 1 deletion rust/cubestore/src/table/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ pub struct TimestampValue {
}

impl TimestampValue {
pub fn new(unix_nano: i64) -> TimestampValue {
/// Builds a `TimestampValue` from a Unix timestamp expressed in nanoseconds.
///
/// The remainder of `unix_nano` modulo 1000 is dropped before the value is stored, so the
/// result is always a multiple of 1000 ns. Because `%` keeps the sign of the dividend, the
/// value is rounded toward zero for negative inputs as well.
///
/// This is a workaround for the in-memory representation being more precise than the on-disk
/// one: truncating at construction time keeps in-memory ordering consistent with what is
/// written out.
///
/// NOTE(review): the original comment stated that on-disk precision is millisecond, but
/// removing `% 1000` from a nanosecond count leaves microsecond precision -- confirm which
/// unit the storage layer actually uses.
pub fn new(unix_nano: i64) -> TimestampValue {
    let sub_unit_nanos = unix_nano % 1000;
    TimestampValue {
        unix_nano: unix_nano - sub_unit_nanos,
    }
}

Expand Down

0 comments on commit 58a8cb4

Please sign in to comment.