Skip to content

Commit

Permalink
Make START TRANSACTION WITH CONSISTENT ROCKSDB SNAPSHOT read only and…
Browse files Browse the repository at this point in the history
… use DB::GetSnapshot

Summary:
RocksDB's diff D50475 had a side effect: SingleDelete entries
weren't removed while a long-running transaction opened via the
RocksDB Transaction API was active. To prevent this issue, the snapshot
has to be created without using the Transaction API, but then the
transaction has to be read only. A practical use case for such a
long-running transaction is logical backup, which uses START
TRANSACTION WITH CONSISTENT ROCKSDB SNAPSHOT.
This diff makes the transaction read only and raises an error if any
update is attempted within the transaction. This restriction has to be
documented.

@update-submodule: rocksdb

Test Plan: mtr, new test case rocksdb.read_only_tx

Reviewers: spetrunia, jkedgar, hermanlee4

Reviewed By: hermanlee4

Subscribers: anthony, webscalesql-eng

Differential Revision: https://reviews.facebook.net/D51945
  • Loading branch information
yoshinorim authored and jtolmer committed Jan 5, 2016
1 parent 5315d1f commit c8b0c7a
Show file tree
Hide file tree
Showing 8 changed files with 158 additions and 4 deletions.
Expand Up @@ -131,6 +131,7 @@ id value value2
START TRANSACTION WITH CONSISTENT SNAPSHOT;
ERROR: 1105
INSERT INTO r1 values (11,11,11);
ERROR: 0
SELECT * FROM r1;
id value value2
1 1 1
Expand Down
Expand Up @@ -125,6 +125,7 @@ id value value2
START TRANSACTION WITH CONSISTENT SNAPSHOT;
ERROR: 0
INSERT INTO r1 values (11,11,11);
ERROR: 1105
SELECT * FROM r1;
id value value2
1 1 1
Expand All @@ -137,7 +138,6 @@ id value value2
8 8 8
9 9 9
10 10 10
11 11 11
drop table r1;
connection default;
disconnect con1;
Expand Down
38 changes: 38 additions & 0 deletions mysql-test/suite/rocksdb/r/read_only_tx.result
@@ -0,0 +1,38 @@
DROP TABLE IF EXISTS t1;
CREATE TABLE t1 (id INT, value int, PRIMARY KEY (id), INDEX (value)) ENGINE=RocksDB;
INSERT INTO t1 VALUES (1,1);
select variable_value into @p from information_schema.global_status where variable_name='rocksdb_number_sst_entry_put';
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
START TRANSACTION WITH CONSISTENT ROCKSDB SNAPSHOT;
File Position Gtid_executed
master-bin.000001 734 uuid:1-3
select case when variable_value-@p < 1000 then 'true' else variable_value-@p end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_put';
case when variable_value-@p < 1000 then 'true' else variable_value-@p end
true
select case when variable_value-@s < 100 then 'true' else variable_value-@s end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
case when variable_value-@s < 100 then 'true' else variable_value-@s end
true
SELECT * FROM t1;
id value
1 1
INSERT INTO t1 values (2, 2);
ERROR HY000: Can't execute updates when you started a transaction with START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT.
ROLLBACK;
SELECT * FROM t1;
id value
1 10001
INSERT INTO t1 values (2, 2);
SELECT * FROM t1 ORDER BY id;
id value
1 10001
2 2
BEGIN;
SELECT COUNT(*) FROM t1;
COUNT(*)
9998
COMMIT;
OPTIMIZE TABLE t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
DROP TABLE t1;
reset master;
3 changes: 3 additions & 0 deletions mysql-test/suite/rocksdb/t/consistent_snapshot.inc
Expand Up @@ -118,7 +118,10 @@ SELECT * FROM r1; # 9
--error 0,ER_UNKNOWN_ERROR
START TRANSACTION WITH CONSISTENT SNAPSHOT;
--echo ERROR: $mysql_errno
# Succeeds with Read Committed, Fails with Repeatable Read
--error 0,ER_UNKNOWN_ERROR
INSERT INTO r1 values (11,11,11);
--echo ERROR: $mysql_errno
SELECT * FROM r1; # self changes should be visible


Expand Down
1 change: 1 addition & 0 deletions mysql-test/suite/rocksdb/t/read_only_tx-master.opt
@@ -0,0 +1 @@
--rocksdb_default_cf_options=write_buffer_size=16k --log-bin --binlog_format=row --gtid_mode=ON --enforce_gtid_consistency --log-slave-updates
70 changes: 70 additions & 0 deletions mysql-test/suite/rocksdb/t/read_only_tx.test
@@ -0,0 +1,70 @@
--source include/have_log_bin.inc
--source include/have_rocksdb.inc
--source include/count_sessions.inc
--disable_warnings
--source include/have_gtid.inc
--enable_warnings
-- let $uuid = `select @@server_uuid;`

--disable_warnings
DROP TABLE IF EXISTS t1;
--enable_warnings

connect (con1,localhost,root,,);
connect (con2,localhost,root,,);

connection con1;
CREATE TABLE t1 (id INT, value int, PRIMARY KEY (id), INDEX (value)) ENGINE=RocksDB;
INSERT INTO t1 VALUES (1,1);

# Read-only, long-running transaction. SingleDelete/Put shouldn't increase much.
select variable_value into @p from information_schema.global_status where variable_name='rocksdb_number_sst_entry_put';
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
-- replace_result $uuid uuid
START TRANSACTION WITH CONSISTENT ROCKSDB SNAPSHOT;

connection con2;
--disable_query_log
let $i = 1;
while ($i <= 10000) {
let $update = UPDATE t1 SET value=value+1 WHERE id=1;
inc $i;
eval $update;
}
--enable_query_log

connection con1;
select case when variable_value-@p < 1000 then 'true' else variable_value-@p end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_put';
select case when variable_value-@s < 100 then 'true' else variable_value-@s end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
SELECT * FROM t1;
--error ER_UNKNOWN_ERROR
INSERT INTO t1 values (2, 2);
ROLLBACK;
SELECT * FROM t1;
INSERT INTO t1 values (2, 2);
SELECT * FROM t1 ORDER BY id;

# Regular long-running transaction.
# No "Transaction could not check for conflicts for opearation" error should happen.
BEGIN;

connection con2;
--disable_query_log
let $i = 5;
while ($i <= 10000) {
let $insert = INSERT INTO t1 VALUES ($i, $i);
inc $i;
eval $insert;
}
--enable_query_log

connection con1;
SELECT COUNT(*) FROM t1;
COMMIT;

connection default;
disconnect con1;
disconnect con2;
OPTIMIZE TABLE t1;
DROP TABLE t1;
reset master;
45 changes: 43 additions & 2 deletions storage/rocksdb/ha_rocksdb.cc
Expand Up @@ -1193,6 +1193,7 @@ class Rdb_transaction

bool is_trx_failed= false;
bool is_delayed_snapshot= false;
bool tx_read_only= false;

THD* thd_= nullptr;

Expand Down Expand Up @@ -1301,6 +1302,7 @@ class Rdb_transaction
delete txn;
txn= NULL;
n_writes= 0;
set_tx_read_only(false);
return res;
}

Expand All @@ -1315,6 +1317,7 @@ class Rdb_transaction
txn->Rollback();
delete txn;
txn= NULL;
set_tx_read_only(false);
}
}

Expand All @@ -1328,7 +1331,10 @@ class Rdb_transaction
void acquire_snapshot(bool acquire_now)
{
if (read_opts.snapshot == nullptr) {
if (acquire_now) {
if (is_tx_read_only()) {
snapshot_created(rdb->GetSnapshot());
}
else if (acquire_now) {
txn->SetSnapshot();
snapshot_created(txn->GetSnapshot());
}
Expand All @@ -1346,8 +1352,16 @@ class Rdb_transaction
if (read_opts.snapshot != nullptr)
{
register_snapshot_release(read_opts.snapshot);
if (is_tx_read_only())
{
rdb->ReleaseSnapshot(read_opts.snapshot);
need_clear = false;
}
else
{
need_clear = true;
}
read_opts.snapshot = nullptr;
need_clear = true;
}

if (need_clear && txn != nullptr)
Expand Down Expand Up @@ -1535,6 +1549,22 @@ class Rdb_transaction
}
}

/*
This is used by transactions started with "START TRANSACTION WITH "
"CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
snapshot has to be created via DB::GetSnapshot(), not via Transaction
API.
*/
bool is_tx_read_only() const
{
return tx_read_only;
}

void set_tx_read_only(bool val)
{
tx_read_only= val;
}

explicit Rdb_transaction(THD *thd) : txn(nullptr), thd_(thd)
{
// Create a notifier that can be called when a snapshot gets generated.
Expand Down Expand Up @@ -1977,6 +2007,7 @@ static int rocksdb_start_trx_and_assign_read_view(
}

Rdb_transaction* trx= get_or_create_trx(thd);
trx->set_tx_read_only(true);
rocksdb_register_trx(hton, thd, trx);
trx->acquire_snapshot(true);

Expand Down Expand Up @@ -5936,6 +5967,16 @@ int ha_rocksdb::external_lock(THD *thd, int lock_type)

if (lock_type == F_WRLCK)
{
if (trx->is_tx_read_only())
{
my_printf_error(ER_UNKNOWN_ERROR,
"Can't execute updates when you started a transaction "
"with START TRANSACTION WITH CONSISTENT [ROCKSDB] "
"SNAPSHOT.",
MYF(0));
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
}

/*
SQL layer signals us to take a write lock. It does so when starting DML
statement. We should put locks on the rows we're reading.
Expand Down

0 comments on commit c8b0c7a

Please sign in to comment.