rdmaio: enable fio-native io-direction semantics
Currently the rdmaio configuration API is far from ideal:
1) It is too RDMA-specific.
2) The client always runs in rw=write mode, so all statistics are accounted to writes.

Let's make it more fio friendly:
 - odirect={0,1}: selects zero-copy memory semantics (FIO_RDMA_MEM_XXX) when enabled, channel semantics otherwise
 - rw={read,write}: represents the data direction
 - listen={0,1}: uses the same semantics as the net and netsplice engines
So the new configuration matrix looks as follows:
   if (td->o.odirect) {
       td_read(td)  ==> FIO_RDMA_MEM_READ
       td_write(td) ==> FIO_RDMA_MEM_WRITE
   } else {
       td_read(td)  ==> FIO_RDMA_CHA_RECV
       td_write(td) ==> FIO_RDMA_CHA_SEND
   }
   
This definitely looks much cleaner and represents the actual difference between the modes.
It also allows sane read/write statistics.
- remove option: verb

This breaks old configs, but the change is explicit and visible.
IMHO this is not a problem, since rdmaio has been completely broken for too long.
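
For illustration only (not part of this commit), a minimal job pair using the new options might look like the sketch below; ${RDMA_SERVER} and port 8888 are placeholders mirroring the updated examples, and direct=0 selects channel (Send/Recv) semantics per the matrix above:

   # server.fio -- accept an incoming RDMA-CM connection
   [global]
   ioengine=rdma
   listen=1
   port=8888
   bs=1m
   size=100g

   [server]
   iodepth=16

   # client.fio -- channel semantics: direct=0, rw=write ==> FIO_RDMA_CHA_SEND
   [global]
   ioengine=rdma
   hostname=${RDMA_SERVER}
   port=8888
   bs=1m
   size=100g

   [send-client]
   rw=write
   direct=0
   iodepth=16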
dmonakhov committed Jun 10, 2020
1 parent 2006a8d commit 90d9edb
Showing 6 changed files with 73 additions and 65 deletions.
22 changes: 12 additions & 10 deletions HOWTO
@@ -1900,8 +1900,15 @@ I/O engine
**rdma**
The RDMA I/O engine supports both RDMA memory semantics
(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
InfiniBand, RoCE and iWARP protocols. This engine defines engine
specific options.
InfiniBand, RoCE and iWARP protocols. The engine applies the following
I/O direction transformation matrix:
With :option:`direct` enabled:
`DDIR_READ` : `IBV_WC_RDMA_READ`
`DDIR_WRITE` : `IBV_WC_RDMA_WRITE`
With :option:`direct` disabled:
`DDIR_READ` : `IBV_WC_RECV`
`DDIR_WRITE` : `IBV_WC_SEND`
This engine defines engine specific options.

**falloc**
I/O engine that does regular fallocate to simulate data transfer as
@@ -2169,11 +2176,13 @@ with the caveat that when used on the command line, they must come after the
hostname if the job is a TCP listener or UDP reader. For unix sockets, the
normal :option:`filename` option should be used and the port is invalid.

.. option:: listen : [netsplice] [net]
.. option:: listen : [netsplice] [net] [rdma]

For TCP network connections, tell fio to listen for incoming connections
rather than initiating an outgoing connection. The :option:`hostname` must
be omitted if this option is used.
For the RDMA engine, tell fio to listen for an incoming RDMA-CM connection
rather than initiating an outgoing connection.

.. option:: pingpong : [netsplice] [net]

Expand Down Expand Up @@ -2245,13 +2254,6 @@ with the caveat that when used on the command line, they must come after the

The size of the chunk to use for each file.

.. option:: verb=str : [rdma]

The RDMA verb to use on this side of the RDMA ioengine connection. Valid
values are write, read, send and recv. These correspond to the equivalent
RDMA verbs (e.g. write = rdma_write etc.). Note that this only needs to be
specified on the client side of the connection. See the examples folder.

.. option:: bindname=str : [rdma]

The name to use to bind the local RDMA-CM connection to a local RDMA device.
59 changes: 26 additions & 33 deletions engines/rdma.c
@@ -57,6 +57,7 @@ enum rdma_io_mode

struct rdmaio_options {
struct thread_data *td;
bool listen;
unsigned int port;
enum rdma_io_mode verb;
char *bindname;
@@ -104,33 +105,14 @@ static struct fio_option options[] = {
.group = FIO_OPT_G_RDMA,
},
{
.name = "verb",
.lname = "RDMA engine verb",
.alias = "proto",
.type = FIO_OPT_STR,
.off1 = offsetof(struct rdmaio_options, verb),
.help = "RDMA engine verb",
.def = "write",
.posval = {
{ .ival = "write",
.oval = FIO_RDMA_MEM_WRITE,
.help = "Memory Write",
},
{ .ival = "read",
.oval = FIO_RDMA_MEM_READ,
.help = "Memory Read",
},
{ .ival = "send",
.oval = FIO_RDMA_CHA_SEND,
.help = "Posted Send",
},
{ .ival = "recv",
.oval = FIO_RDMA_CHA_RECV,
.help = "Posted Receive",
},
},
.name = "listen",
.lname = "rdma engine listen",
.help = "Listen for incoming RDMA-CM connections",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct rdmaio_options, listen),
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_RDMA,
.group = FIO_OPT_G_RDMA,
},
{
.name = NULL,
@@ -896,6 +878,18 @@ static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f)
return 1;
}

if (td->o.odirect) {
if (td_read(td))
rd->rdma_protocol = FIO_RDMA_MEM_READ;
else
rd->rdma_protocol = FIO_RDMA_MEM_WRITE;
} else {
if (td_read(td))
rd->rdma_protocol = FIO_RDMA_CHA_RECV;
else
rd->rdma_protocol = FIO_RDMA_CHA_SEND;

}
/* send task request */
strncpy(rd->send_buf.name, td->o.name, FIO_RDMA_NAME_MAX - 1);
rd->send_buf.name[FIO_RDMA_NAME_MAX - 1] = '\0';
@@ -960,7 +954,9 @@ static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f)

static int fio_rdmaio_open_file(struct thread_data *td, struct fio_file *f)
{
if (td_read(td))
struct rdmaio_options *o = td->eo;

if (o->listen)
return fio_rdmaio_accept(td, f);
else
return fio_rdmaio_connect(td, f);
@@ -1299,16 +1295,13 @@ static int fio_rdmaio_init(struct thread_data *td)
log_err("fio: rdma_create_id fail: %m\n");
return 1;
}

if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) ||
(rd->rdma_protocol == FIO_RDMA_MEM_READ)) {
if (td->o.odirect) {
rd->rmt_us =
malloc(FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
memset(rd->rmt_us, 0,
FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
rd->rmt_nr = 0;
}

rd->io_us_queued = malloc(td->o.iodepth * sizeof(struct io_u *));
memset(rd->io_us_queued, 0, td->o.iodepth * sizeof(struct io_u *));
rd->io_u_queued_nr = 0;
@@ -1321,7 +1314,7 @@ static int fio_rdmaio_init(struct thread_data *td)
memset(rd->io_us_completed, 0, td->o.iodepth * sizeof(struct io_u *));
rd->io_u_completed_nr = 0;

if (td_read(td)) { /* READ as the server */
if (o->listen) { /* READ as the server */
rd->is_client = 0;
td->flags |= TD_F_NO_PROGRESS;
/* server rd->rdma_buf_len will be setup after got request */
24 changes: 16 additions & 8 deletions examples/gpudirect-rdmaio-client.fio
@@ -1,15 +1,23 @@
# Example gpudirect rdma client job
[global]
ioengine=rdma
hostname=[hostname]
port=[port]
verb=[read/write/send/recv]
mem=cudamalloc
gpu_dev_id=0
hostname=${RDMA_SERVER}
port=8888
bs=1m
size=100g
mem=cudamalloc
gpu_dev_id=0

[sender]
[cudadirect-client-rdma-write]
# use IBV_WC_RDMA_WRITE
rw=write
iodepth=1
iodepth_batch_complete=1
direct=1
numjobs=2
iodepth=16

[cudadirect-client-rdma-read]
# use IBV_WC_RDMA_READ
rw=read
direct=1
numjobs=2
iodepth=16
13 changes: 7 additions & 6 deletions examples/gpudirect-rdmaio-server.fio
@@ -1,12 +1,13 @@
# Example rdma server job
# Example gpudirect rdma server job
[global]
ioengine=rdma
port=[port]
mem=cudamalloc
gpu_dev_id=0
listen=1
port=8888
bs=1m
size=100g
mem=cudamalloc
gpu_dev_id=0

[receiver]
rw=read
[server]
numjobs=4
iodepth=16
17 changes: 11 additions & 6 deletions examples/rdmaio-client.fio
@@ -3,14 +3,19 @@
ioengine=rdma
hostname=${RDMA_SERVER}
port=8888
# Data exchange method
# verb=[read/write/send/recv]
verb=write
bs=1m
size=100g
# IO direction: [read] as the server, [write] as for client
rw=write

[client-rdma-write]
numjobs=4
# use IBV_WC_RDMA_WRITE
rw=write
direct=1
numjobs=2
iodepth=16

[client-rdma-read]
# use IBV_WC_RDMA_READ
rw=read
direct=1
numjobs=2
iodepth=16
3 changes: 1 addition & 2 deletions examples/rdmaio-server.fio
@@ -1,11 +1,10 @@
# Example rdma server job
[global]
ioengine=rdma
listen=1
port=8888
bs=1m
size=100g
# IO direction: [read] as the server, [write] as for client
rw=read

[server]
numjobs=4
