rdmaio: enable fio-native io-direction semantics
Currently the rdmaio configuration API is far from ideal:
1) It is too RDMA specific.
2) Since the client always runs in rw=write mode, all stats are accounted to writes.

Let's make it more fio friendly:
- odirect={0,1}: selects whether we use zero-copy memory semantics (FIO_RDMA_MEM_XXX) or channel semantics (FIO_RDMA_CHA_XXX)
- rw={read,write}: represents the data direction
- listen={0,1}: uses the same semantics as the net and netsplice engines
So the new configuration matrix looks as follows:
if (td->o.odirect) {
    td_read(td)  ==> FIO_RDMA_MEM_READ
    td_write(td) ==> FIO_RDMA_MEM_WRITE
} else {
    td_read(td)  ==> FIO_RDMA_CHA_RECV
    td_write(td) ==> FIO_RDMA_CHA_SEND
}
   
This definitely looks much cleaner and represents the actual difference between the modes.
It also allows sane read/write statistics.
- remove option: verb

This breaks old configs, but makes the change explicit and visible.
IMHO this is not a problem, since rdmaio has been completely broken for too long.
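
For illustration, a minimal client/server job pair under the new semantics could look
like the following sketch (it mirrors the updated examples/rdmaio-*.fio files in this
commit; ${RDMA_SERVER} and port 8888 are placeholders):

    # server.fio -- accept an incoming RDMA-CM connection
    [global]
    ioengine=rdma
    listen=1
    port=8888
    bs=1m
    size=100g

    [server]
    iodepth=16

    # client.fio -- zero-copy RDMA writes into the server's memory
    [global]
    ioengine=rdma
    hostname=${RDMA_SERVER}
    port=8888
    bs=1m
    size=100g

    [client-rdma-write]
    rw=write
    direct=1
    iodepth=16
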
dmonakhov committed Jun 10, 2020
1 parent 2006a8d commit 90d9edb
Showing 6 changed files with 73 additions and 65 deletions.
22 changes: 12 additions & 10 deletions HOWTO
@@ -1900,8 +1900,15 @@ I/O engine
**rdma**
The RDMA I/O engine supports both RDMA memory semantics
(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
InfiniBand, RoCE and iWARP protocols. This engine defines engine
specific options.
InfiniBand, RoCE and iWARP protocols. This engine defines the following
I/O direction transformation matrix:
With :option:`direct` enabled:
`DDIR_READ`  : `IBV_WC_RDMA_READ`
`DDIR_WRITE` : `IBV_WC_RDMA_WRITE`
With :option:`direct` disabled:
`DDIR_READ`  : `IBV_WC_RECV`
`DDIR_WRITE` : `IBV_WC_SEND`
This engine defines engine specific options.

**falloc**
I/O engine that does regular fallocate to simulate data transfer as
@@ -2169,11 +2176,13 @@ with the caveat that when used on the command line, they must come after the
hostname if the job is a TCP listener or UDP reader. For unix sockets, the
normal :option:`filename` option should be used and the port is invalid.

.. option:: listen : [netsplice] [net]
.. option:: listen : [netsplice] [net] [rdma]

For TCP network connections, tell fio to listen for incoming connections
rather than initiating an outgoing connection. The :option:`hostname` must
be omitted if this option is used.
For the RDMA engine, tell fio to listen for an incoming RDMA-CM connection
rather than initiating an outgoing connection.

.. option:: pingpong : [netsplice] [net]

Expand Down Expand Up @@ -2245,13 +2254,6 @@ with the caveat that when used on the command line, they must come after the

The size of the chunk to use for each file.

.. option:: verb=str : [rdma]

The RDMA verb to use on this side of the RDMA ioengine connection. Valid
values are write, read, send and recv. These correspond to the equivalent
RDMA verbs (e.g. write = rdma_write etc.). Note that this only needs to be
specified on the client side of the connection. See the examples folder.

.. option:: bindname=str : [rdma]

The name to use to bind the local RDMA-CM connection to a local RDMA device.
59 changes: 26 additions & 33 deletions engines/rdma.c
@@ -57,6 +57,7 @@ enum rdma_io_mode {

struct rdmaio_options {
struct thread_data *td;
bool listen;
unsigned int port;
enum rdma_io_mode verb;
char *bindname;
@@ -104,33 +105,14 @@ static struct fio_option options[] = {
.group = FIO_OPT_G_RDMA,
},
{
.name = "verb",
.lname = "RDMA engine verb",
.alias = "proto",
.type = FIO_OPT_STR,
.off1 = offsetof(struct rdmaio_options, verb),
.help = "RDMA engine verb",
.def = "write",
.posval = {
{ .ival = "write",
.oval = FIO_RDMA_MEM_WRITE,
.help = "Memory Write",
},
{ .ival = "read",
.oval = FIO_RDMA_MEM_READ,
.help = "Memory Read",
},
{ .ival = "send",
.oval = FIO_RDMA_CHA_SEND,
.help = "Posted Send",
},
{ .ival = "recv",
.oval = FIO_RDMA_CHA_RECV,
.help = "Posted Receive",
},
},
.name = "listen",
.lname = "rdma engine listen",
.help = "Listen for incoming RDMA-CM connections",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct rdmaio_options, listen),
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_RDMA,
.group = FIO_OPT_G_RDMA,
},
{
.name = NULL,
@@ -896,6 +878,18 @@ static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f)
return 1;
}

if (td->o.odirect) {
if (td_read(td))
rd->rdma_protocol = FIO_RDMA_MEM_READ;
else
rd->rdma_protocol = FIO_RDMA_MEM_WRITE;
} else {
if (td_read(td))
rd->rdma_protocol = FIO_RDMA_CHA_RECV;
else
rd->rdma_protocol = FIO_RDMA_CHA_SEND;

}
/* send task request */
strncpy(rd->send_buf.name, td->o.name, FIO_RDMA_NAME_MAX - 1);
rd->send_buf.name[FIO_RDMA_NAME_MAX - 1] = '\0';
@@ -960,7 +954,9 @@ static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f)

static int fio_rdmaio_open_file(struct thread_data *td, struct fio_file *f)
{
if (td_read(td))
struct rdmaio_options *o = td->eo;

if (o->listen)
return fio_rdmaio_accept(td, f);
else
return fio_rdmaio_connect(td, f);
@@ -1299,16 +1295,13 @@ static int fio_rdmaio_init(struct thread_data *td)
log_err("fio: rdma_create_id fail: %m\n");
return 1;
}

if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) ||
(rd->rdma_protocol == FIO_RDMA_MEM_READ)) {
if (td->o.odirect) {
rd->rmt_us =
malloc(FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
memset(rd->rmt_us, 0,
FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
rd->rmt_nr = 0;
}

rd->io_us_queued = malloc(td->o.iodepth * sizeof(struct io_u *));
memset(rd->io_us_queued, 0, td->o.iodepth * sizeof(struct io_u *));
rd->io_u_queued_nr = 0;
@@ -1321,7 +1314,7 @@ static int fio_rdmaio_init(struct thread_data *td)
memset(rd->io_us_completed, 0, td->o.iodepth * sizeof(struct io_u *));
rd->io_u_completed_nr = 0;

if (td_read(td)) { /* READ as the server */
if (o->listen) { /* READ as the server */
rd->is_client = 0;
td->flags |= TD_F_NO_PROGRESS;
/* server rd->rdma_buf_len will be setup after got request */
24 changes: 16 additions & 8 deletions examples/gpudirect-rdmaio-client.fio
@@ -1,15 +1,23 @@
# Example gpudirect rdma client job
[global]
ioengine=rdma
hostname=[hostname]
port=[port]
verb=[read/write/send/recv]
mem=cudamalloc
gpu_dev_id=0
hostname=${RDMA_SERVER}
port=8888
bs=1m
size=100g
mem=cudamalloc
gpu_dev_id=0

[sender]
[cudadirect-client-rdma-write]
# use IBV_WC_RDMA_WRITE
rw=write
iodepth=1
iodepth_batch_complete=1
direct=1
numjobs=2
iodepth=16

[cudadirect-client-rdma-read]
# use IBV_WC_RDMA_READ
rw=read
direct=1
numjobs=2
iodepth=16
13 changes: 7 additions & 6 deletions examples/gpudirect-rdmaio-server.fio
@@ -1,12 +1,13 @@
# Example rdma server job
# Example gpudirect rdma server job
[global]
ioengine=rdma
port=[port]
mem=cudamalloc
gpu_dev_id=0
listen=1
port=8888
bs=1m
size=100g
mem=cudamalloc
gpu_dev_id=0

[receiver]
rw=read
[server]
numjobs=4
iodepth=16
17 changes: 11 additions & 6 deletions examples/rdmaio-client.fio
@@ -3,14 +3,19 @@
ioengine=rdma
hostname=${RDMA_SERVER}
port=8888
# Data exchange method
# verb=[read/write/send/recv]
verb=write
bs=1m
size=100g
# IO direction: [read] as the server, [write] as for client
rw=write

[client-rdma-write]
numjobs=4
# use IBV_WC_RDMA_WRITE
rw=write
direct=1
numjobs=2
iodepth=16

[client-rdma-read]
# use IBV_WC_RDMA_READ
rw=read
direct=1
numjobs=2
iodepth=16
3 changes: 1 addition & 2 deletions examples/rdmaio-server.fio
@@ -1,11 +1,10 @@
# Example rdma server job
[global]
ioengine=rdma
listen=1
port=8888
bs=1m
size=100g
# IO direction: [read] as the server, [write] as for client
rw=read

[server]
numjobs=4
