Skip to content

Commit f58fc75

Browse files
committed
Merge: Sched/psi: updates to v6.3-rc1
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/2325 JIRA: https://issues.redhat.com/browse/RHEL-311 Tested: Enabled PSI and ran various stress tests. Updates and bug fixes for the PSI subsystem. This brings the code up to about v6.3-rc1. It does not include the runtime enablement interface (34f26a1 "sched/psi: Per-cgroup PSI accounting disable/re-enable interface") that required a larger set of cgroup and kernfs patches. That may be taken later if the prerequisites are provided. Signed-off-by: Phil Auld <pauld@redhat.com> Approved-by: Waiman Long <longman@redhat.com> Approved-by: Rafael Aquini <aquini@redhat.com> Approved-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> Approved-by: Jerry Snitselaar <jsnitsel@redhat.com> Signed-off-by: Jan Stancek <jstancek@redhat.com>
2 parents 0a30299 + 4712fca commit f58fc75

File tree

11 files changed

+378
-142
lines changed

11 files changed

+378
-142
lines changed

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -968,6 +968,12 @@ All cgroup core files are prefixed with "cgroup."
968968
killing cgroups is a process directed operation, i.e. it affects
969969
the whole thread-group.
970970

971+
irq.pressure
972+
A read-write nested-keyed file.
973+
974+
Shows pressure stall information for IRQ/SOFTIRQ. See
975+
:ref:`Documentation/accounting/psi.rst <psi>` for details.
976+
971977
Controllers
972978
===========
973979

include/linux/cgroup.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -673,11 +673,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
673673
pr_cont_kernfs_path(cgrp->kn);
674674
}
675675

676-
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
677-
{
678-
return cgrp->psi;
679-
}
680-
681676
bool cgroup_psi_enabled(void);
682677

683678
static inline void cgroup_init_kthreadd(void)

include/linux/psi.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#include <linux/psi_types.h>
66
#include <linux/sched.h>
77
#include <linux/poll.h>
8+
#include <linux/cgroup-defs.h>
9+
#include <linux/cgroup.h>
810

911
struct seq_file;
1012
struct css_set;
@@ -16,10 +18,6 @@ extern struct psi_group psi_system;
1618

1719
void psi_init(void);
1820

19-
void psi_task_change(struct task_struct *task, int clear, int set);
20-
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
21-
bool sleep);
22-
2321
void psi_memstall_enter(unsigned long *flags);
2422
void psi_memstall_leave(unsigned long *flags);
2523

@@ -32,6 +30,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
3230
poll_table *wait);
3331

3432
#ifdef CONFIG_CGROUPS
33+
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
34+
{
35+
return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
36+
}
37+
3538
int psi_cgroup_alloc(struct cgroup *cgrp);
3639
void psi_cgroup_free(struct cgroup *cgrp);
3740
void cgroup_move_task(struct task_struct *p, struct css_set *to);

include/linux/psi_types.h

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,36 @@ enum psi_task_count {
1515
NR_MEMSTALL,
1616
NR_RUNNING,
1717
/*
18-
* This can't have values other than 0 or 1 and could be
19-
* implemented as a bit flag. But for now we still have room
20-
* in the first cacheline of psi_group_cpu, and this way we
21-
* don't have to special case any state tracking for it.
18+
* For IO and CPU stalls the presence of running/oncpu tasks
19+
* in the domain means a partial rather than a full stall.
20+
* For memory it's not so simple because of page reclaimers:
21+
* they are running/oncpu while representing a stall. To tell
22+
* whether a domain has productivity left or not, we need to
23+
* distinguish between regular running (i.e. productive)
24+
* threads and memstall ones.
2225
*/
23-
NR_ONCPU,
26+
NR_MEMSTALL_RUNNING,
2427
NR_PSI_TASK_COUNTS = 4,
2528
};
2629

2730
/* Task state bitmasks */
2831
#define TSK_IOWAIT (1 << NR_IOWAIT)
2932
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
3033
#define TSK_RUNNING (1 << NR_RUNNING)
31-
#define TSK_ONCPU (1 << NR_ONCPU)
34+
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
35+
36+
/* Only one task can be scheduled, no corresponding task count */
37+
#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS)
3238

3339
/* Resources that workloads could be stalled on */
3440
enum psi_res {
3541
PSI_IO,
3642
PSI_MEM,
3743
PSI_CPU,
38-
NR_PSI_RESOURCES = 3,
44+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
45+
PSI_IRQ,
46+
#endif
47+
NR_PSI_RESOURCES,
3948
};
4049

4150
/*
@@ -51,11 +60,20 @@ enum psi_states {
5160
PSI_MEM_FULL,
5261
PSI_CPU_SOME,
5362
PSI_CPU_FULL,
63+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
64+
PSI_IRQ_FULL,
65+
#endif
5466
/* Only per-CPU, to weigh the CPU in the global average: */
5567
PSI_NONIDLE,
56-
NR_PSI_STATES = 7,
68+
NR_PSI_STATES,
5769
};
5870

71+
/* Use one bit in the state mask to track TSK_ONCPU */
72+
#define PSI_ONCPU (1 << NR_PSI_STATES)
73+
74+
/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
75+
#define PSI_STATE_RESCHEDULE (1 << (NR_PSI_STATES + 1))
76+
5977
enum psi_aggregators {
6078
PSI_AVGS = 0,
6179
PSI_POLL,
@@ -135,6 +153,8 @@ struct psi_trigger {
135153
};
136154

137155
struct psi_group {
156+
struct psi_group *parent;
157+
138158
/* Protects data used by the aggregator */
139159
struct mutex avgs_lock;
140160

@@ -158,6 +178,7 @@ struct psi_group {
158178
struct timer_list poll_timer;
159179
wait_queue_head_t poll_wait;
160180
atomic_t poll_wakeup;
181+
atomic_t poll_scheduled;
161182

162183
/* Protects data used by the monitor */
163184
struct mutex trigger_lock;

include/linux/sched.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -902,9 +902,6 @@ struct task_struct {
902902
unsigned sched_reset_on_fork:1;
903903
unsigned sched_contributes_to_load:1;
904904
unsigned sched_migrated:1;
905-
#ifdef CONFIG_PSI
906-
unsigned sched_psi_wake_requeue:1;
907-
#endif
908905

909906
/* Force alignment to the next boundary: */
910907
unsigned :0;

include/linux/wait.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void
217217
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
218218
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
219219
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
220+
void __wake_up_pollfree(struct wait_queue_head *wq_head);
220221

221222
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
222223
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
@@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
245246
#define wake_up_interruptible_sync_poll_locked(x, m) \
246247
__wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
247248

249+
/**
250+
* wake_up_pollfree - signal that a polled waitqueue is going away
251+
* @wq_head: the wait queue head
252+
*
253+
* In the very rare cases where a ->poll() implementation uses a waitqueue whose
254+
* lifetime is tied to a task rather than to the 'struct file' being polled,
255+
* this function must be called before the waitqueue is freed so that
256+
* non-blocking polls (e.g. epoll) are notified that the queue is going away.
257+
*
258+
* The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
259+
* an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
260+
*/
261+
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
262+
{
263+
/*
264+
* For performance reasons, we don't always take the queue lock here.
265+
* Therefore, we might race with someone removing the last entry from
266+
* the queue, and proceed while they still hold the queue lock.
267+
* However, rcu_read_lock() is required to be held in such cases, so we
268+
* can safely proceed with an RCU-delayed free.
269+
*/
270+
if (waitqueue_active(wq_head))
271+
__wake_up_pollfree(wq_head);
272+
}
273+
248274
#define ___wait_cond_timeout(condition) \
249275
({ \
250276
bool __cond = (condition); \

kernel/cgroup/cgroup.c

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3683,21 +3683,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
36833683
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
36843684
{
36853685
struct cgroup *cgrp = seq_css(seq)->cgroup;
3686-
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3686+
struct psi_group *psi = cgroup_psi(cgrp);
36873687

36883688
return psi_show(seq, psi, PSI_IO);
36893689
}
36903690
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
36913691
{
36923692
struct cgroup *cgrp = seq_css(seq)->cgroup;
3693-
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3693+
struct psi_group *psi = cgroup_psi(cgrp);
36943694

36953695
return psi_show(seq, psi, PSI_MEM);
36963696
}
36973697
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
36983698
{
36993699
struct cgroup *cgrp = seq_css(seq)->cgroup;
3700-
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3700+
struct psi_group *psi = cgroup_psi(cgrp);
37013701

37023702
return psi_show(seq, psi, PSI_CPU);
37033703
}
@@ -3723,7 +3723,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
37233723
return -EBUSY;
37243724
}
37253725

3726-
psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3726+
psi = cgroup_psi(cgrp);
37273727
new = psi_trigger_create(psi, buf, res);
37283728
if (IS_ERR(new)) {
37293729
cgroup_put(cgrp);
@@ -3757,6 +3757,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
37573757
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
37583758
}
37593759

3760+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3761+
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
3762+
{
3763+
struct cgroup *cgrp = seq_css(seq)->cgroup;
3764+
struct psi_group *psi = cgroup_psi(cgrp);
3765+
3766+
return psi_show(seq, psi, PSI_IRQ);
3767+
}
3768+
3769+
static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
3770+
char *buf, size_t nbytes,
3771+
loff_t off)
3772+
{
3773+
return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
3774+
}
3775+
#endif
3776+
37603777
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
37613778
poll_table *pt)
37623779
{
@@ -3774,6 +3791,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
37743791

37753792
bool cgroup_psi_enabled(void)
37763793
{
3794+
if (static_branch_likely(&psi_disabled))
3795+
return false;
3796+
37773797
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
37783798
}
37793799

@@ -5150,6 +5170,16 @@ static struct cftype cgroup_base_files[] = {
51505170
.poll = cgroup_pressure_poll,
51515171
.release = cgroup_pressure_release,
51525172
},
5173+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
5174+
{
5175+
.name = "irq.pressure",
5176+
.flags = CFTYPE_PRESSURE,
5177+
.seq_show = cgroup_irq_pressure_show,
5178+
.write = cgroup_irq_pressure_write,
5179+
.poll = cgroup_pressure_poll,
5180+
.release = cgroup_pressure_release,
5181+
},
5182+
#endif
51535183
#endif /* CONFIG_PSI */
51545184
{ } /* terminate */
51555185
};

kernel/sched/core.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
709709

710710
rq->prev_irq_time += irq_delta;
711711
delta -= irq_delta;
712+
psi_account_irqtime(rq->curr, irq_delta);
712713
#endif
713714
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
714715
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -2100,7 +2101,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
21002101

21012102
if (!(flags & ENQUEUE_RESTORE)) {
21022103
sched_info_enqueue(rq, p);
2103-
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
2104+
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
21042105
}
21052106

21062107
uclamp_rq_inc(rq, p);

0 commit comments

Comments
 (0)