Skip to content
Permalink
Browse files

Supporting SingleDelete in MyRocks

Summary:
RocksDB supports SingleDelete operations, which
help to reduce tombstones. SingleDelete can be used
when keys are guaranteed never to be overwritten.
In MyRocks, keys are never overwritten for 1. secondary keys
and 2. primary keys if all columns are covered by the primary key.
This diff adds SingleDelete support for both cases. This diff
also adds status variables to count the number of
Put/Delete/SingleDelete/etc. entries written to sst files. These counters
are useful for debugging, and the new test case relies on these counters.

Test Plan: mtr and new test case rocksdb.singledelete

Reviewers: maykov, spetrunia, jkedgar, hermanlee4

Reviewed By: hermanlee4

Differential Revision: https://reviews.facebook.net/D48015
Differential Revision: https://reviews.facebook.net/D50277
  • Loading branch information...
yoshinorim authored and hermanlee committed Oct 2, 2015
1 parent ec717d1 commit bd338db88062765c39433738363d9eb564819eca
@@ -1431,6 +1431,11 @@ rocksdb_number_multiget_bytes_read #
rocksdb_number_multiget_get #
rocksdb_number_multiget_keys_read #
rocksdb_number_reseeks_iteration #
rocksdb_number_sst_entry_delete #
rocksdb_number_sst_entry_merge #
rocksdb_number_sst_entry_other #
rocksdb_number_sst_entry_put #
rocksdb_number_sst_entry_singledelete #
rocksdb_number_stat_computes #
rocksdb_number_superversion_acquires #
rocksdb_number_superversion_cleanups #
@@ -1487,6 +1492,11 @@ ROCKSDB_NUMBER_MULTIGET_BYTES_READ
ROCKSDB_NUMBER_MULTIGET_GET
ROCKSDB_NUMBER_MULTIGET_KEYS_READ
ROCKSDB_NUMBER_RESEEKS_ITERATION
ROCKSDB_NUMBER_SST_ENTRY_DELETE
ROCKSDB_NUMBER_SST_ENTRY_MERGE
ROCKSDB_NUMBER_SST_ENTRY_OTHER
ROCKSDB_NUMBER_SST_ENTRY_PUT
ROCKSDB_NUMBER_SST_ENTRY_SINGLEDELETE
ROCKSDB_NUMBER_STAT_COMPUTES
ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES
ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS
@@ -1545,6 +1555,11 @@ ROCKSDB_NUMBER_MULTIGET_BYTES_READ
ROCKSDB_NUMBER_MULTIGET_GET
ROCKSDB_NUMBER_MULTIGET_KEYS_READ
ROCKSDB_NUMBER_RESEEKS_ITERATION
ROCKSDB_NUMBER_SST_ENTRY_DELETE
ROCKSDB_NUMBER_SST_ENTRY_MERGE
ROCKSDB_NUMBER_SST_ENTRY_OTHER
ROCKSDB_NUMBER_SST_ENTRY_PUT
ROCKSDB_NUMBER_SST_ENTRY_SINGLEDELETE
ROCKSDB_NUMBER_STAT_COMPUTES
ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES
ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS
@@ -0,0 +1,66 @@
CREATE TABLE t1 (id INT, value int, PRIMARY KEY (id), INDEX (value)) ENGINE=RocksDB;
INSERT INTO t1 VALUES (1,1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
select case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end
true
select case when variable_value-@d < 10 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
case when variable_value-@d < 10 then 'true' else 'false' end
true
CREATE TABLE t2 (id INT, value int, PRIMARY KEY (id), INDEX (value)) ENGINE=RocksDB;
INSERT INTO t2 VALUES (1,1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
optimize table t2;
Table Op Msg_type Msg_text
test.t2 optimize status OK
select case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end
true
select case when variable_value-@d > 9000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
case when variable_value-@d > 9000 then 'true' else 'false' end
true
CREATE TABLE t3 (id INT, value int, PRIMARY KEY (id)) ENGINE=RocksDB;
INSERT INTO t3 VALUES (1,1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
optimize table t3;
Table Op Msg_type Msg_text
test.t3 optimize status OK
select case when variable_value-@s = 0 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
case when variable_value-@s = 0 then 'true' else 'false' end
true
select case when variable_value-@d > 9000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
case when variable_value-@d > 9000 then 'true' else 'false' end
true
CREATE TABLE t4 (id INT, PRIMARY KEY (id)) ENGINE=RocksDB;
INSERT INTO t4 VALUES (1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
optimize table t4;
Table Op Msg_type Msg_text
test.t4 optimize status OK
select case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end
true
select case when variable_value-@d < 10 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
case when variable_value-@d < 10 then 'true' else 'false' end
true
CREATE TABLE t5 (id1 INT, id2 INT, PRIMARY KEY (id1, id2), INDEX(id2)) ENGINE=RocksDB;
INSERT INTO t5 VALUES (1, 1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
optimize table t5;
Table Op Msg_type Msg_text
test.t5 optimize status OK
select case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end
true
select case when variable_value-@d < 10 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
case when variable_value-@d < 10 then 'true' else 'false' end
true
DROP TABLE t1, t2, t3, t4, t5;
@@ -16,7 +16,7 @@ let $i=0;
while ($i<1000)
{
inc $i;
eval update r1 set value2=value2+1 where id1=500;
eval update r1 set id2=id2+10000 where id1=500;
}
--enable_query_log
set global rocksdb_force_flush_memtable_now=1;
@@ -0,0 +1 @@
--rocksdb_default_cf_options=write_buffer_size=16k
@@ -0,0 +1,89 @@
--source include/have_rocksdb.inc

# t1: only the secondary-key column changes, so only SingleDelete increases
CREATE TABLE t1 (id INT, value int, PRIMARY KEY (id), INDEX (value)) ENGINE=RocksDB;
INSERT INTO t1 VALUES (1,1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
--disable_query_log
let $i = 1;
while ($i <= 10000) {
let $update = UPDATE t1 SET value=value+1 WHERE value=$i;
inc $i;
eval $update;
}
--enable_query_log
optimize table t1;
select case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select case when variable_value-@d < 10 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';


# t2: the PK changes but does not cover all columns, so both SingleDelete (secondary key) and Delete (PK) increase
CREATE TABLE t2 (id INT, value int, PRIMARY KEY (id), INDEX (value)) ENGINE=RocksDB;
INSERT INTO t2 VALUES (1,1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
--disable_query_log
let $i = 1;
while ($i <= 10000) {
let $update = UPDATE t2 SET id=id+1 WHERE id=$i;
inc $i;
eval $update;
}
--enable_query_log
optimize table t2;
select case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select case when variable_value-@d > 9000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';

# t3: the PK changes, does not cover all columns, and there is no secondary key, so only Delete increases
CREATE TABLE t3 (id INT, value int, PRIMARY KEY (id)) ENGINE=RocksDB;
INSERT INTO t3 VALUES (1,1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
--disable_query_log
let $i = 1;
while ($i <= 10000) {
let $update = UPDATE t3 SET id=id+1 WHERE id=$i;
inc $i;
eval $update;
}
--enable_query_log
optimize table t3;
select case when variable_value-@s = 0 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select case when variable_value-@d > 9000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';

# t4: the PK covers every column of the table, so only SingleDelete increases
CREATE TABLE t4 (id INT, PRIMARY KEY (id)) ENGINE=RocksDB;
INSERT INTO t4 VALUES (1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
--disable_query_log
let $i = 1;
while ($i <= 10000) {
let $update = UPDATE t4 SET id=id+1 WHERE id=$i;
inc $i;
eval $update;
}
--enable_query_log
optimize table t4;
select case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select case when variable_value-@d < 10 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';

# t5: the PK covers every column and there is a secondary key, so only SingleDelete increases
CREATE TABLE t5 (id1 INT, id2 INT, PRIMARY KEY (id1, id2), INDEX(id2)) ENGINE=RocksDB;
INSERT INTO t5 VALUES (1, 1);
select variable_value into @s from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select variable_value into @d from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';
--disable_query_log
let $i = 1;
while ($i <= 10000) {
let $update = UPDATE t5 SET id1=id1+1 WHERE id1=$i;
inc $i;
eval $update;
}
--enable_query_log
optimize table t5;
select case when variable_value-@s > 10 and variable_value-@s < 100 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_singledelete';
select case when variable_value-@d < 10 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_number_sst_entry_delete';

DROP TABLE t1, t2, t3, t4, t5;
@@ -1212,6 +1212,14 @@ class Rdb_transaction
return txn->Delete(column_family, key);
}

/*
  Issue a SingleDelete for `key` in `column_family` through the underlying
  transaction.

  The write counter is bumped first; once it exceeds max_row_locks the call
  is refused with an Aborted status carrying err_too_many_locks and nothing
  is forwarded to the transaction.
*/
rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle* column_family,
                             const rocksdb::Slice& key)
{
  const bool over_lock_limit= (++n_writes > max_row_locks);
  if (!over_lock_limit)
    return txn->SingleDelete(column_family, key);

  return rocksdb::Status::Aborted(rocksdb::Slice(err_too_many_locks));
}

rocksdb::Status Get(rocksdb::ColumnFamilyHandle* column_family,
const rocksdb::Slice& key, std::string* value)
{
@@ -4469,6 +4477,17 @@ void ha_rocksdb::unlock_row()
}
}

/*
  Tell whether entries of the given index may be removed with SingleDelete
  instead of a regular Delete.

  @param index  position of the index in table->key_info

  @return true  when the index is not the table's PRIMARY KEY (secondary
                indexes always qualify), or when it is the PRIMARY KEY and
                its key parts cover every field of the table
  @return false when it is the PRIMARY KEY and some field is not part of it
*/
bool ha_rocksdb::can_use_single_delete(uint index)
{
  /* Secondary indexes can always use SingleDelete. */
  if (index != table->s->primary_key)
    return true;

  /* PRIMARY KEY: usable only if the key spans all columns of the table. */
  return table->key_info[index].actual_key_parts == table->s->fields;
}

int ha_rocksdb::write_row(uchar *buf)
{
@@ -4711,8 +4730,9 @@ int ha_rocksdb::update_write_row(const uchar *old_data,
{
if (pk_key_changed)
{
rocksdb::Status s= trx->Delete(key_descr[i]->get_cf(),
old_pk_key_slice);
rocksdb::Status s= delete_or_singledelete(i, trx,
key_descr[i]->get_cf(),
old_pk_key_slice);
if (!s.ok())
{
DBUG_RETURN(return_status_error(table->in_use, s));
@@ -4769,7 +4789,9 @@ int ha_rocksdb::update_write_row(const uchar *old_data,
memcmp(sec_key_tails_old, sec_key_tails, old_key_tail_size) == 0)
continue;

auto s= trx->Delete(key_descr[i]->get_cf(), old_key_slice);
rocksdb::Status s= delete_or_singledelete(i, trx,
key_descr[i]->get_cf(),
old_key_slice);
if (!s.ok())
{
DBUG_RETURN(return_status_error(table->in_use, s));
@@ -5050,9 +5072,9 @@ int ha_rocksdb::delete_row(const uchar *buf)

rocksdb::Slice key_slice(last_rowkey.ptr(), last_rowkey.length());
Rdb_transaction *trx= get_or_create_trx(table->in_use);
rocksdb::Status s;
s= trx->Delete(pk_descr->get_cf(), key_slice);

rocksdb::Status s= delete_or_singledelete(table->s->primary_key, trx,
pk_descr->get_cf(),
key_slice);
if (!s.ok())
{
DBUG_RETURN(return_status_error(table->in_use, s));
@@ -5069,7 +5091,7 @@ int ha_rocksdb::delete_row(const uchar *buf)
NULL, NULL);
rocksdb::Slice secondary_key_slice((char*)sec_key_packed_tuple,
packed_size);
s= trx->Delete(key_descr[i]->get_cf(), secondary_key_slice);
s= trx->SingleDelete(key_descr[i]->get_cf(), secondary_key_slice);
if (!s.ok())
{
DBUG_RETURN(return_status_error(table->in_use, s));
@@ -5083,6 +5105,17 @@ int ha_rocksdb::delete_row(const uchar *buf)
DBUG_RETURN(0);
}

/*
  Remove `key` from `column_family` within transaction `trx`, choosing the
  delete flavour from the index it belongs to: SingleDelete when
  can_use_single_delete(index) allows it, a regular Delete otherwise.

  @return the status reported by the transaction call that was made
*/
rocksdb::Status
ha_rocksdb::delete_or_singledelete(uint index,
                                   Rdb_transaction *trx,
                                   rocksdb::ColumnFamilyHandle* column_family,
                                   const rocksdb::Slice& key)
{
  return can_use_single_delete(index)
           ? trx->SingleDelete(column_family, key)
           : trx->Delete(column_family, key);
}


int ha_rocksdb::info(uint flag)
{
@@ -5661,7 +5694,10 @@ void ha_rocksdb::remove_rows(RDBSE_TABLE_DEF *tbl)
rocksdb::Slice key= it->key();
if (!tbl->key_descr[i]->covers_key(key.data(), key.size()))
break;
rdb->Delete(wo, key);
if (can_use_single_delete(i))
rdb->SingleDelete(wo, key);
else
rdb->Delete(wo, key);
it->Next();
}
}
@@ -6290,6 +6326,16 @@ static SHOW_VAR rocksdb_status_vars[]= {
DEF_STATUS_VAR(number_superversion_cleanups),
DEF_STATUS_VAR(number_block_not_compressed),
DEF_STATUS_VAR_PTR("number_stat_computes", &rocksdb_number_stat_computes, SHOW_LONGLONG),
DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
SHOW_LONGLONG),
DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
SHOW_LONGLONG),
DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
&rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
SHOW_LONGLONG),
DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
SHOW_LONGLONG),
{NullS, NullS, SHOW_LONG}
};

@@ -258,6 +258,7 @@ class ha_rocksdb: public handler

void update_auto_incr_val();
void load_auto_incr_value();
bool can_use_single_delete(uint index);

/*
Descriptor telling how to decode/encode a field to on-disk record storage
@@ -457,6 +458,10 @@ class ha_rocksdb: public handler
int write_row(uchar *buf);
int update_row(const uchar *old_data, uchar *new_data);
int delete_row(const uchar *buf);
rocksdb::Status delete_or_singledelete(uint index,
Rdb_transaction *trx,
rocksdb::ColumnFamilyHandle* cf,
const rocksdb::Slice& key);

int index_next(uchar *buf);
int index_next_with_direction(uchar *buf, bool move_forward);

0 comments on commit bd338db

Please sign in to comment.
You can’t perform that action at this time.