Skip to content

Commit

Permalink
librbd: add 'write_zeroes' public C/C++ API methods
Browse files Browse the repository at this point in the history
Unlike the existing 'discard' option which is more of a hint to
attempt to release space, the new 'write_zeroes' APIs will ensure
that the entire provided extent is fully zeroed.

Signed-off-by: Jason Dillaman <dillaman@redhat.com>
(cherry picked from commit ae6dd86)

Conflicts:
	src/librbd/api/Io.h/cc: logic exists in ImageRequestWQ files
	src/librbd/librbd.cc: trivial resolution due to missing api::Io
	src/test/librbd/test_librbd.cc: trivial resolution
  • Loading branch information
Jason Dillaman committed Jul 22, 2020
1 parent 3a03673 commit 0e3add9
Show file tree
Hide file tree
Showing 6 changed files with 212 additions and 11 deletions.
19 changes: 15 additions & 4 deletions src/include/rbd/librbd.h
Expand Up @@ -47,6 +47,7 @@ extern "C" {
#define LIBRBD_SUPPORTS_IOVEC 1
#define LIBRBD_SUPPORTS_WATCH 0
#define LIBRBD_SUPPORTS_WRITESAME 1
#define LIBRBD_SUPPORTS_WRITE_ZEROES 1

#if __GNUC__ >= 4
#define CEPH_RBD_API __attribute__ ((visibility ("default")))
Expand Down Expand Up @@ -1094,10 +1095,15 @@ CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
const char *buf, int op_flags);
CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
const char *buf, size_t data_len, int op_flags);
const char *buf, size_t data_len,
int op_flags);
/**
 * Synchronously zero an extent of the image.
 *
 * Unlike rbd_discard, which is more of a hint to attempt to release space,
 * this call ensures that the entire provided extent is fully zeroed.
 *
 * The type of len is size_t so that this declaration matches the
 * extern "C" definition and the sibling rbd_aio_write_zeroes declaration
 * on every ABI (size_t and uint64_t are not the same type everywhere).
 *
 * @param image the image to zero
 * @param ofs byte offset of the start of the extent
 * @param len length of the extent in bytes
 * @param zero_flags must be 0; any other value is rejected with -EINVAL
 * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
 * @returns the zeroed length on success, negative error code on failure
 */
CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs,
                                      size_t len, int zero_flags,
                                      int op_flags);
CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs,
size_t len, const char *cmp_buf,
const char *buf, uint64_t *mismatch_off,
const char *buf,
uint64_t *mismatch_off,
int op_flags);

CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
Expand Down Expand Up @@ -1125,10 +1131,15 @@ CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
const char *buf, size_t data_len,
rbd_completion_t c, int op_flags);
/**
 * Asynchronously zero the extent starting at off and spanning len bytes.
 *
 * The outcome of the request is delivered through the completion c.
 *
 * @param zero_flags must be 0; otherwise the completion fails with -EINVAL
 * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
 * @returns 0 once the request has been handed to the image's IO work queue
 */
CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off,
size_t len, rbd_completion_t c,
int zero_flags, int op_flags);
CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image,
uint64_t off, size_t len,
const char *cmp_buf, const char *buf,
rbd_completion_t c, uint64_t *mismatch_off,
const char *cmp_buf,
const char *buf,
rbd_completion_t c,
uint64_t *mismatch_off,
int op_flags);

CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
Expand Down
10 changes: 9 additions & 1 deletion src/include/rbd/librbd.hpp
Expand Up @@ -664,20 +664,29 @@ class CEPH_RBD_API Image
ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl);
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);

int discard(uint64_t ofs, uint64_t len);
ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags);
/* zeroes the whole extent (unlike discard, which is more of a hint to release space) */
/* @param zero_flags must be 0; other values are rejected with -EINVAL */
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags);

ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);

int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl,
RBD::AioCompletion *c, int op_flags);

int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
RBD::AioCompletion *c, int op_flags);
/* asynchronous write_zeroes; the result is reported through the completion c */
/* @param zero_flags must be 0; otherwise the completion fails with -EINVAL */
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c,
int zero_flags, int op_flags);

int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
ceph::bufferlist& bl, RBD::AioCompletion *c,
uint64_t *mismatch_off, int op_flags);

/**
* read async from image
*
Expand All @@ -699,7 +708,6 @@ class CEPH_RBD_API Image
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl,
RBD::AioCompletion *c, int op_flags);
int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);

int flush();
/**
Expand Down
82 changes: 82 additions & 0 deletions src/librbd/io/ImageRequestWQ.cc
Expand Up @@ -212,6 +212,32 @@ ssize_t ImageRequestWQ<I>::writesame(uint64_t off, uint64_t len,
return len;
}

/**
 * Blocking write-zeroes.
 *
 * Validates the requested extent via clip_io (which receives &len and may
 * therefore adjust it), submits the request through aio_write_zeroes and
 * waits for it to complete.
 *
 * @returns the (possibly adjusted) length on success, negative error code
 *          on failure
 */
template <typename I>
ssize_t ImageRequestWQ<I>::write_zeroes(uint64_t off, uint64_t len,
                                        int zero_flags, int op_flags) {
  auto cct = m_image_ctx.cct;
  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
                 << "len = " << len << dendl;

  // the image lock is only needed while the extent is validated
  int r;
  {
    std::shared_lock image_locker{m_image_ctx.image_lock};
    r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
  }
  if (r < 0) {
    lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
    return r;
  }

  // submit as a non-native-async request and block on its completion
  C_SaferCond cond;
  auto completion = io::AioCompletion::create(&cond);
  aio_write_zeroes(completion, off, len, zero_flags, op_flags, false);

  r = cond.wait();
  if (r >= 0) {
    return len;
  }
  return r;
}

template <typename I>
ssize_t ImageRequestWQ<I>::compare_and_write(uint64_t off, uint64_t len,
bufferlist &&cmp_bl,
Expand Down Expand Up @@ -486,6 +512,62 @@ void ImageRequestWQ<I>::aio_writesame(AioCompletion *c, uint64_t off,
trace.event("finish");
}


// Asynchronously zero the extent [off, off + len).
//
// The request is issued as a discard request with a discard granularity of
// zero and is accounted as AIO_TYPE_DISCARD. The result is reported through
// aio_comp. Any non-zero zero_flags fails the completion with -EINVAL.
//
// NOTE(review): op_flags is accepted but never referenced in this body.
template <typename I>
void ImageRequestWQ<I>::aio_write_zeroes(io::AioCompletion *aio_comp,
uint64_t off, uint64_t len,
int zero_flags, int op_flags,
bool native_async) {
auto cct = m_image_ctx.cct;
FUNCTRACE(cct);
ZTracer::Trace trace;
// only set up a blkin trace when tracing of all IO is enabled
if (m_image_ctx.blkin_trace_all) {
trace.init("io: write_zeroes", &m_image_ctx.trace_endpoint);
trace.event("init");
}

aio_comp->init_time(util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_DISCARD);
ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
<< "completion=" << aio_comp << ", off=" << off << ", "
<< "len=" << len << dendl;

// event notification is only requested for native async callers and only
// when the image has a valid event socket
if (native_async && m_image_ctx.event_socket.is_valid()) {
aio_comp->set_event_notify(true);
}

// validate the supported flags (none are currently accepted)
if (zero_flags != 0U) {
aio_comp->fail(-EINVAL);
return;
}

// nothing more to do here if the IO could not be started
if (!start_in_flight_io(aio_comp)) {
return;
}

// enable partial discard (zeroing) of objects
uint32_t discard_granularity_bytes = 0;

auto tid = ++m_last_tid;

// record the tid as queued-or-blocked before the request is dispatched
{
std::lock_guard locker{m_lock};
m_queued_or_blocked_io_tids.insert(tid);
}

auto req = ImageDispatchSpec<I>::create_discard_request(
m_image_ctx, aio_comp, off, len, discard_granularity_bytes, trace, tid);

// with the owner lock held shared: queue the request when non-blocking AIO
// is configured or writes are blocked, otherwise process it inline
std::shared_lock owner_locker{m_image_ctx.owner_lock};
if (m_image_ctx.non_blocking_aio || writes_blocked()) {
queue(req);
} else {
process_io(req, false);
finish_in_flight_io();
}
trace.event("finish");
}

template <typename I>
void ImageRequestWQ<I>::aio_compare_and_write(AioCompletion *c,
uint64_t off, uint64_t len,
Expand Down
4 changes: 4 additions & 0 deletions src/librbd/io/ImageRequestWQ.h
Expand Up @@ -38,6 +38,8 @@ class ImageRequestWQ
ssize_t discard(uint64_t off, uint64_t len,
uint32_t discard_granularity_bytes);
ssize_t writesame(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags);
// blocking write-zeroes; returns the zeroed length or a negative error code
ssize_t write_zeroes(uint64_t off, uint64_t len, int zero_flags,
int op_flags);
ssize_t compare_and_write(uint64_t off, uint64_t len,
bufferlist &&cmp_bl, bufferlist &&bl,
uint64_t *mismatch_off, int op_flags);
Expand All @@ -52,6 +54,8 @@ class ImageRequestWQ
void aio_flush(AioCompletion *c, bool native_async=true);
void aio_writesame(AioCompletion *c, uint64_t off, uint64_t len,
bufferlist &&bl, int op_flags, bool native_async=true);
// asynchronous write-zeroes; the result is reported through the completion.
// NOTE: unlike the neighbouring aio_* declarations, native_async has no
// default value here and must always be passed explicitly.
void aio_write_zeroes(AioCompletion *c, uint64_t off, uint64_t len,
int zero_flags, int op_flags, bool native_async);
void aio_compare_and_write(AioCompletion *c, uint64_t off,
uint64_t len, bufferlist &&cmp_bl,
bufferlist &&bl, uint64_t *mismatch_off,
Expand Down
49 changes: 43 additions & 6 deletions src/librbd/librbd.cc
Expand Up @@ -2540,8 +2540,8 @@ namespace librbd {
}

bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
if (discard_zero && mem_is_zero(bl.c_str(), bl.length())) {
int r = ictx->io_work_queue->discard(ofs, len, 0);
if (discard_zero && bl.is_zero()) {
int r = ictx->io_work_queue->write_zeroes(ofs, len, 0U, op_flags);
tracepoint(librbd, writesame_exit, r);
return r;
}
Expand All @@ -2551,6 +2551,13 @@ namespace librbd {
return r;
}

ssize_t Image::write_zeroes(uint64_t ofs, size_t len, int zero_flags,
int op_flags)
{
ImageCtx *ictx = (ImageCtx *)ctx;
return ictx->io_work_queue->write_zeroes(ofs, len, zero_flags, op_flags);
}

ssize_t Image::compare_and_write(uint64_t ofs, size_t len,
ceph::bufferlist &cmp_bl, ceph::bufferlist& bl,
uint64_t *mismatch_off, int op_flags)
Expand Down Expand Up @@ -2678,8 +2685,9 @@ namespace librbd {
}

bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
if (discard_zero && mem_is_zero(bl.c_str(), bl.length())) {
ictx->io_work_queue->aio_discard(get_aio_completion(c), off, len, 0);
if (discard_zero && bl.is_zero()) {
ictx->io_work_queue->aio_write_zeroes(get_aio_completion(c), off, len, 0U,
op_flags, true);
tracepoint(librbd, aio_writesame_exit, 0);
return 0;
}
Expand All @@ -2690,6 +2698,15 @@ namespace librbd {
return 0;
}

int Image::aio_write_zeroes(uint64_t off, size_t len, RBD::AioCompletion *c,
int zero_flags, int op_flags)
{
ImageCtx *ictx = (ImageCtx *)ctx;
ictx->io_work_queue->aio_write_zeroes(
get_aio_completion(c), off, len, zero_flags, op_flags, true);
return 0;
}

int Image::aio_compare_and_write(uint64_t off, size_t len,
ceph::bufferlist& cmp_bl, ceph::bufferlist& bl,
RBD::AioCompletion *c, uint64_t *mismatch_off,
Expand Down Expand Up @@ -5861,7 +5878,7 @@ extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,

bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
if (discard_zero && mem_is_zero(buf, data_len)) {
int r = ictx->io_work_queue->discard(ofs, len, 0);
int r = ictx->io_work_queue->write_zeroes(ofs, len, 0, op_flags);
tracepoint(librbd, writesame_exit, r);
return r;
}
Expand All @@ -5873,6 +5890,13 @@ extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
return r;
}

/*
 * C API entry point: synchronously zero the extent starting at ofs and
 * spanning len bytes by forwarding to the image's IO work queue.
 */
extern "C" ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, size_t len,
                                    int zero_flags, int op_flags)
{
  auto *image_ctx = static_cast<librbd::ImageCtx *>(image);
  auto *work_queue = image_ctx->io_work_queue;
  return work_queue->write_zeroes(ofs, len, zero_flags, op_flags);
}

extern "C" ssize_t rbd_compare_and_write(rbd_image_t image,
uint64_t ofs, size_t len,
const char *cmp_buf,
Expand Down Expand Up @@ -6085,7 +6109,8 @@ extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,

bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
if (discard_zero && mem_is_zero(buf, data_len)) {
ictx->io_work_queue->aio_discard(get_aio_completion(comp), off, len, 0);
ictx->io_work_queue->aio_write_zeroes(get_aio_completion(comp), off, len, 0,
op_flags, true);
tracepoint(librbd, aio_writesame_exit, 0);
return 0;
}
Expand All @@ -6099,6 +6124,18 @@ extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
return 0;
}

/*
 * C API entry point: asynchronously zero the extent starting at off and
 * spanning len bytes. The request is submitted as a native async request and
 * its result is delivered through the completion c; this call returns 0.
 */
extern "C" int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, size_t len,
                                    rbd_completion_t c, int zero_flags,
                                    int op_flags)
{
  auto *image_ctx = static_cast<librbd::ImageCtx *>(image);
  auto *completion = static_cast<librbd::RBD::AioCompletion *>(c);
  auto *aio_comp = get_aio_completion(completion);

  image_ctx->io_work_queue->aio_write_zeroes(aio_comp, off, len, zero_flags,
                                             op_flags, true);
  return 0;
}

extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off,
size_t len, const char *cmp_buf,
const char *buf, rbd_completion_t c,
Expand Down
59 changes: 59 additions & 0 deletions src/test/librbd/test_librbd.cc
Expand Up @@ -8175,6 +8175,65 @@ TEST_F(TestLibRBD, SnapRemoveWithChildMissing)
rados_ioctx_destroy(ioctx1);
}

// Exercises Image::write_zeroes: writes 256 bytes of 1s, zeroes the tail and
// then the head of that extent, and checks both the diff reported by
// diff_iterate2 and the data read back from the image.
TEST_F(TestLibRBD, WriteZeroes) {
librbd::RBD rbd;
librados::IoCtx ioctx;
ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
std::string name = get_temp_image_name();
int order = 0;
uint64_t size = 2 << 20;
ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));

librbd::Image image;
ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));

// 1s from [0, 256) / length 256
char data[256];
memset(data, 1, sizeof(data));
bufferlist bl;
bl.append(data, 256);
ASSERT_EQ(256, image.write(0, 256, bl));

interval_set<uint64_t> diff;
ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
iterate_cb, (void *)&diff));
auto expected_diff = interval_set<uint64_t>{{{0, 256}}};
ASSERT_EQ(expected_diff, diff);

// write zeroes from offset 192 to the end of the image, i.e. past the end
// of the extent written above.
// Now 1s from [0, 192) / length 192
ASSERT_EQ(size - 192,
image.write_zeroes(192, size - 192, 0U, 0));
diff.clear();
ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
iterate_cb, (void *)&diff));
expected_diff = interval_set<uint64_t>{{{0, 192}}};
ASSERT_EQ(expected_diff, diff);

// zero the head of the existing extent
// Now 1s from [64, 192) / length 128
ASSERT_EQ(64, image.write_zeroes(0, 64, 0U, 0));

// the reported diff is still expected to be [0, 192)
diff.clear();
ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
iterate_cb, (void *)&diff));
expected_diff = interval_set<uint64_t>{{{0, 192}}};
ASSERT_EQ(expected_diff, diff);

// expected image contents: 64 zero bytes, 128 bytes of 1s, zeroes to the end
bufferlist expected_bl;
expected_bl.append_zero(64);
bufferlist sub_bl;
sub_bl.substr_of(bl, 0, 128);
expected_bl.claim_append(sub_bl);
expected_bl.append_zero(size - 192);

bufferlist read_bl;
EXPECT_EQ(size, image.read(0, size, read_bl));
EXPECT_EQ(expected_bl, read_bl);

ASSERT_EQ(0, image.close());
}

// poorman's ceph_assert()
namespace ceph {
void __ceph_assert_fail(const char *assertion, const char *file, int line,
Expand Down

0 comments on commit 0e3add9

Please sign in to comment.