Commit 3a4d319

Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - NAPI fixes and cleanups (Pavel, Olivier)

 - Add support for absolute timeouts (Pavel)

 - Fixes for io-wq/sqpoll affinities (Felix)

 - Efficiency improvements for dealing with huge pages (Chenliang)

 - Support for a minwait mode, where the application essentially has two
   timeouts - one smaller one that defines the batch timeout, and the
   overall larger one similar to what we had before. This enables
   efficient use of batching based on count + timeout, while still
   working well with periods of less intensive workloads

 - Use ITER_UBUF for single segment sends

 - Add support for incremental buffer consumption. Right now each
   operation will always consume a full buffer. With incremental
   consumption, a recv/read operation only consumes the part of the
   buffer that it needs to satisfy the operation

 - Add support for GCOV for io_uring, to help retain a high
   test-coverage-to-code ratio

 - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly
   converted to a blocking retry

 - Add support for cloning registered buffers from one ring to another

 - Misc cleanups (Anuj, me)

* tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits)
  io_uring: add IORING_REGISTER_COPY_BUFFERS method
  io_uring/register: provide helper to get io_ring_ctx from 'fd'
  io_uring/rsrc: add reference count to struct io_mapped_ubuf
  io_uring/rsrc: clear 'slot' entry upfront
  io_uring/io-wq: inherit cpuset of cgroup in io worker
  io_uring/io-wq: do not allow pinning outside of cpuset
  io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common()
  io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN
  io_uring/sqpoll: do not allow pinning outside of cpuset
  io_uring/eventfd: move refs to refcount_t
  io_uring: remove unused rsrc_put_fn
  io_uring: add new line after variable declaration
  io_uring: add GCOV_PROFILE_URING Kconfig option
  io_uring/kbuf: add support for incremental buffer consumption
  io_uring/kbuf: pass in 'len' argument for buffer commit
  Revert "io_uring: Require zeroed sqe->len on provided-buffers send"
  io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h
  io_uring/kbuf: add io_kbuf_commit() helper
  io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  io_uring: wire up min batch wake timeout
  ...
2 parents 69a3a0a + 7cc2a6e commit 3a4d319
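The min-wait mode described in the message above is driven from userspace through the extended io_uring_enter(2) argument: the new min_wait_usec field (see the include/uapi/linux/io_uring.h hunk below) takes over the old pad field in struct io_uring_getevents_arg, IORING_FEAT_MIN_TIMEOUT advertises the feature, and IORING_ENTER_ABS_TIMER can make the ts timeout absolute. A minimal raw-syscall sketch, assuming my reading of the series is right (min_wait_usec is the short batch timeout used while collecting min_complete completions, ts stays the overall timeout); the helper name is made up:

/* Sketch only: wait for up to 'want' completions with a short batch
 * timeout, falling back to a longer overall timeout. The application
 * should first check io_uring_params.features for IORING_FEAT_MIN_TIMEOUT.
 */
#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

static int wait_cqes_min_timeout(int ring_fd, unsigned int want)
{
	struct __kernel_timespec ts = { .tv_sec = 1 };	/* overall timeout */
	struct io_uring_getevents_arg arg;

	memset(&arg, 0, sizeof(arg));		/* no sigmask */
	arg.min_wait_usec = 100;		/* batch timeout: 100 usec */
	arg.ts = (unsigned long long)(uintptr_t)&ts;

	/* io_uring_enter2 form: last two arguments carry the extended arg */
	return (int)syscall(__NR_io_uring_enter, ring_fd, 0, want,
			    IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
			    &arg, sizeof(arg));
}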

20 files changed (+723, -260 lines)


include/linux/io_uring_types.h

Lines changed: 3 additions & 0 deletions

@@ -239,6 +239,9 @@ struct io_ring_ctx {
 	struct io_rings			*rings;
 	struct percpu_ref		refs;
 
+	clockid_t			clockid;
+	enum tk_offsets			clock_offset;
+
 	enum task_work_notify_mode	notify_method;
 	unsigned			sq_thread_idle;
 } ____cacheline_aligned_in_smp;
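The clockid/clock_offset fields added here back the new IORING_REGISTER_CLOCK opcode introduced in the UAPI header below, letting an application pick the clock that CQ-wait timeouts (including IORING_ENTER_ABS_TIMER waits) are measured against. A minimal sketch of registering CLOCK_MONOTONIC through the raw io_uring_register(2) syscall; the nr_args value of 0 is my assumption, and the helper name is illustrative:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>

/* Sketch: select the clock used for CQ waiting on this ring.
 * struct io_uring_clock_register is the new UAPI struct shown below;
 * its reserved fields must stay zero.
 */
static int register_ring_clock(int ring_fd, clockid_t clockid)
{
	struct io_uring_clock_register reg = {
		.clockid = clockid,	/* e.g. CLOCK_MONOTONIC or CLOCK_BOOTTIME */
	};

	/* nr_args assumed to be 0 for this opcode */
	return (int)syscall(__NR_io_uring_register, ring_fd,
			    IORING_REGISTER_CLOCK, &reg, 0);
}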

include/uapi/linux/io_uring.h

Lines changed: 41 additions & 1 deletion

@@ -440,11 +440,21 @@ struct io_uring_cqe {
  * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
  * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
  * 			them from sends.
+ * IORING_CQE_F_BUF_MORE	If set, the buffer ID set in the completion will get
+ *				more completions. In other words, the buffer is being
+ *				partially consumed, and will be used by the kernel for
+ *				more completions. This is only set for buffers used via
+ *				the incremental buffer consumption, as provided by
+ *				a ring buffer setup with IOU_PBUF_RING_INC. For any
+ *				other provided buffer type, all completions with a
+ *				buffer passed back is automatically returned to the
+ *				application.
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
+#define IORING_CQE_F_BUF_MORE		(1U << 4)
 
 #define IORING_CQE_BUFFER_SHIFT		16
 
@@ -507,6 +517,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_SQ_WAIT		(1U << 2)
 #define IORING_ENTER_EXT_ARG		(1U << 3)
 #define IORING_ENTER_REGISTERED_RING	(1U << 4)
+#define IORING_ENTER_ABS_TIMER		(1U << 5)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -542,6 +553,7 @@ struct io_uring_params {
 #define IORING_FEAT_LINKED_FILE		(1U << 12)
 #define IORING_FEAT_REG_REG_RING	(1U << 13)
 #define IORING_FEAT_RECVSEND_BUNDLE	(1U << 14)
+#define IORING_FEAT_MIN_TIMEOUT		(1U << 15)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -595,6 +607,11 @@ enum io_uring_register_op {
 	IORING_REGISTER_NAPI			= 27,
 	IORING_UNREGISTER_NAPI			= 28,
 
+	IORING_REGISTER_CLOCK			= 29,
+
+	/* copy registered buffers from source ring to current ring */
+	IORING_REGISTER_COPY_BUFFERS		= 30,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -675,6 +692,21 @@ struct io_uring_restriction {
 	__u32 resv2[3];
 };
 
+struct io_uring_clock_register {
+	__u32	clockid;
+	__u32	__resv[3];
+};
+
+enum {
+	IORING_REGISTER_SRC_REGISTERED = 1,
+};
+
+struct io_uring_copy_buffers {
+	__u32	src_fd;
+	__u32	flags;
+	__u32	pad[6];
+};
+
 struct io_uring_buf {
 	__u64	addr;
 	__u32	len;
@@ -707,9 +739,17 @@ struct io_uring_buf_ring {
  *			mmap(2) with the offset set as:
 *			IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
 *			to get a virtual mapping for the ring.
+ * IOU_PBUF_RING_INC:	If set, buffers consumed from this buffer ring can be
+ *			consumed incrementally. Normally one (or more) buffers
+ *			are fully consumed. With incremental consumptions, it's
+ *			feasible to register big ranges of buffers, and each
+ *			use of it will consume only as much as it needs. This
+ *			requires that both the kernel and application keep
+ *			track of where the current read/recv index is at.
  */
 enum io_uring_register_pbuf_ring_flags {
 	IOU_PBUF_RING_MMAP	= 1,
+	IOU_PBUF_RING_INC	= 2,
 };
 
 /* argument for IORING_(UN)REGISTER_PBUF_RING */
@@ -758,7 +798,7 @@ enum io_uring_register_restriction_op {
 struct io_uring_getevents_arg {
 	__u64	sigmask;
 	__u32	sigmask_sz;
-	__u32	pad;
+	__u32	min_wait_usec;
 	__u64	ts;
 };
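A sketch of how a completion loop might use the new IORING_CQE_F_BUF_MORE flag with a provided-buffer ring registered with IOU_PBUF_RING_INC. Following the comment added above: while the flag is set the kernel keeps using the remainder of the buffer, so the application only consumes cqe->res bytes and must not recycle the buffer yet; the bookkeeping struct and the consume_data()/recycle_buffer() helpers are made up for illustration.

#include <linux/io_uring.h>

/* Hypothetical per-buffer bookkeeping kept by the application */
struct app_buf {
	char		*base;		/* start of the provided buffer */
	unsigned int	read_off;	/* how far the kernel has filled it */
};

/* App-defined helpers, illustrative only */
void consume_data(char *data, unsigned int len);
void recycle_buffer(unsigned int bid);

/* Sketch: handle one CQE from a recv/read that used an incremental
 * (IOU_PBUF_RING_INC) provided-buffer ring.
 */
static void handle_cqe(struct io_uring_cqe *cqe, struct app_buf *bufs)
{
	unsigned int bid;
	struct app_buf *buf;

	if (cqe->res < 0 || !(cqe->flags & IORING_CQE_F_BUFFER))
		return;			/* error, or no buffer attached */

	bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	buf = &bufs[bid];

	consume_data(buf->base + buf->read_off, (unsigned int)cqe->res);
	buf->read_off += (unsigned int)cqe->res;

	if (!(cqe->flags & IORING_CQE_F_BUF_MORE)) {
		/* Buffer fully returned to the application: it may be
		 * recycled into the buffer ring again. */
		recycle_buffer(bid);
		buf->read_off = 0;
	}
	/* else: the kernel will keep posting completions into the rest of
	 * this buffer, so leave it untouched for now. */
}

The hunk above also adds IORING_REGISTER_COPY_BUFFERS: struct io_uring_copy_buffers names the source ring via src_fd (with IORING_REGISTER_SRC_REGISTERED in flags presumably meaning src_fd is an index into the registered-ring table rather than a normal fd), which is how registered buffers are cloned from one ring to another per the commit message.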

init/Kconfig

Lines changed: 13 additions & 0 deletions

@@ -1687,6 +1687,19 @@ config IO_URING
 	  applications to submit and complete IO through submission and
 	  completion rings that are shared between the kernel and application.
 
+config GCOV_PROFILE_URING
+	bool "Enable GCOV profiling on the io_uring subsystem"
+	depends on GCOV_KERNEL
+	help
+	  Enable GCOV profiling on the io_uring subsystem, to facilitate
+	  code coverage testing.
+
+	  If unsure, say N.
+
+	  Note that this will have a negative impact on the performance of
+	  the io_uring subsystem, hence this should only be enabled for
+	  specific test purposes.
+
 config ADVISE_SYSCALLS
 	bool "Enable madvise/fadvise syscalls" if EXPERT
 	default y

io_uring/Makefile

Lines changed: 4 additions & 0 deletions

@@ -2,6 +2,10 @@
 #
 # Makefile for io_uring
 
+ifdef CONFIG_GCOV_PROFILE_URING
+GCOV_PROFILE := y
+endif
+
 obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
 					tctx.o filetable.o rw.o net.o poll.o \
 					eventfd.o uring_cmd.o openclose.o \
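For reference, a kernel configuration fragment (a sketch) that enables this: GCOV_PROFILE_URING depends on GCOV_KERNEL as shown in the Kconfig hunk above, and GCOV_KERNEL itself normally requires debugfs to export the coverage data.

# .config fragment (sketch): build io_uring/ with GCOV instrumentation
CONFIG_DEBUG_FS=y
CONFIG_GCOV_KERNEL=y
CONFIG_GCOV_PROFILE_URING=y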

io_uring/eventfd.c

Lines changed: 7 additions & 6 deletions

@@ -15,7 +15,7 @@ struct io_ev_fd {
 	struct eventfd_ctx	*cq_ev_fd;
 	unsigned int		eventfd_async: 1;
 	struct rcu_head		rcu;
-	atomic_t		refs;
+	refcount_t		refs;
 	atomic_t		ops;
 };
 
@@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
 
 	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 
-	if (atomic_dec_and_test(&ev_fd->refs))
+	if (refcount_dec_and_test(&ev_fd->refs))
 		io_eventfd_free(rcu);
 }
 
@@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 	 */
 	if (unlikely(!ev_fd))
 		return;
-	if (!atomic_inc_not_zero(&ev_fd->refs))
+	if (!refcount_inc_not_zero(&ev_fd->refs))
 		return;
 	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
 		goto out;
@@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 		}
 	}
 out:
-	if (atomic_dec_and_test(&ev_fd->refs))
+	if (refcount_dec_and_test(&ev_fd->refs))
 		call_rcu(&ev_fd->rcu, io_eventfd_free);
 }
 
@@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
 	if (IS_ERR(ev_fd->cq_ev_fd)) {
 		int ret = PTR_ERR(ev_fd->cq_ev_fd);
+
 		kfree(ev_fd);
 		return ret;
 	}
@@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 
 	ev_fd->eventfd_async = eventfd_async;
 	ctx->has_evfd = true;
-	atomic_set(&ev_fd->refs, 1);
+	refcount_set(&ev_fd->refs, 1);
 	atomic_set(&ev_fd->ops, 0);
 	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
 	return 0;
@@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
 	if (ev_fd) {
 		ctx->has_evfd = false;
 		rcu_assign_pointer(ctx->io_ev_fd, NULL);
-		if (atomic_dec_and_test(&ev_fd->refs))
+		if (refcount_dec_and_test(&ev_fd->refs))
 			call_rcu(&ev_fd->rcu, io_eventfd_free);
 		return 0;
 	}

io_uring/fdinfo.c

Lines changed: 13 additions & 1 deletion

@@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 			   cqe->user_data, cqe->res, cqe->flags);
 
 	}
-
 	spin_unlock(&ctx->completion_lock);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (ctx->napi_enabled) {
+		seq_puts(m, "NAPI:\tenabled\n");
+		seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
+		if (ctx->napi_prefer_busy_poll)
+			seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+		else
+			seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+	} else {
+		seq_puts(m, "NAPI:\tdisabled\n");
+	}
+#endif
 }
 #endif

io_uring/io-wq.c

Lines changed: 19 additions & 6 deletions

@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/rculist_nulls.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/task_work.h>
 #include <linux/audit.h>
 #include <linux/mmu_context.h>
@@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 	if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
 		goto err;
-	cpumask_copy(wq->cpu_mask, cpu_possible_mask);
+	cpuset_cpus_allowed(data->task, wq->cpu_mask);
 	wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
 	wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
 				task_rlimit(current, RLIMIT_NPROC);
@@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
 
 int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
 {
+	cpumask_var_t allowed_mask;
+	int ret = 0;
+
 	if (!tctx || !tctx->io_wq)
 		return -EINVAL;
 
+	if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
+		return -ENOMEM;
+
 	rcu_read_lock();
-	if (mask)
-		cpumask_copy(tctx->io_wq->cpu_mask, mask);
-	else
-		cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
+	cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
+	if (mask) {
+		if (cpumask_subset(mask, allowed_mask))
+			cpumask_copy(tctx->io_wq->cpu_mask, mask);
+		else
+			ret = -EINVAL;
+	} else {
+		cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
+	}
 	rcu_read_unlock();
 
-	return 0;
+	free_cpumask_var(allowed_mask);
+	return ret;
 }
 
 /*
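With this change, io-wq worker affinity is clamped to the task's cpuset: a freshly created wq inherits cpuset_cpus_allowed() instead of cpu_possible_mask, and an explicit affinity request outside the cpuset now fails with -EINVAL instead of being applied. A minimal sketch of how an application sets that affinity from userspace via the existing IORING_REGISTER_IOWQ_AFF opcode, which takes a cpu mask and its size in bytes; the helper name is illustrative.

#define _GNU_SOURCE		/* for cpu_set_t / CPU_SET */
#include <linux/io_uring.h>
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: pin io-wq workers for this ring to a single CPU. After the
 * cpuset change above, this returns -EINVAL if 'cpu' lies outside the
 * task's cpuset rather than silently widening the mask.
 */
static int pin_iowq_to_cpu(int ring_fd, int cpu)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);

	return (int)syscall(__NR_io_uring_register, ring_fd,
			    IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}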
