Skip to content

Commit 7ae3f6e

Browse files
npiggin authored and mpe committed
powerpc/watchdog: Use hrtimers for per-CPU heartbeat
Using a jiffies timer creates a dependency on the tick_do_timer_cpu
incrementing jiffies. If that CPU has locked up and jiffies is not
incrementing, the watchdog heartbeat timer for all CPUs stops and
creates false positives and confusing warnings on local CPUs, and
also causes the SMP detector to stop, so the root cause is never
detected.

Fix this by using hrtimer based timers for the watchdog heartbeat,
like the generic kernel hardlockup detector.

Cc: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Reported-by: Ravikumar Bangoria <ravi.bangoria@in.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Reported-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent b2d3b5e commit 7ae3f6e

File tree

1 file changed

+40
-41
lines changed

1 file changed

+40
-41
lines changed

arch/powerpc/kernel/watchdog.c

Lines changed: 40 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */
7777

7878
static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */
7979

80-
static DEFINE_PER_CPU(struct timer_list, wd_timer);
80+
static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer);
8181
static DEFINE_PER_CPU(u64, wd_timer_tb);
8282

8383
/* SMP checker bits */
@@ -293,21 +293,21 @@ void soft_nmi_interrupt(struct pt_regs *regs)
293293
nmi_exit();
294294
}
295295

296-
static void wd_timer_reset(unsigned int cpu, struct timer_list *t)
297-
{
298-
t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms);
299-
if (wd_timer_period_ms > 1000)
300-
t->expires = __round_jiffies_up(t->expires, cpu);
301-
add_timer_on(t, cpu);
302-
}
303-
304-
static void wd_timer_fn(struct timer_list *t)
296+
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
305297
{
306298
int cpu = smp_processor_id();
307299

300+
if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
301+
return HRTIMER_NORESTART;
302+
303+
if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
304+
return HRTIMER_NORESTART;
305+
308306
watchdog_timer_interrupt(cpu);
309307

310-
wd_timer_reset(cpu, t);
308+
hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms));
309+
310+
return HRTIMER_RESTART;
311311
}
312312

313313
void arch_touch_nmi_watchdog(void)
@@ -323,37 +323,22 @@ void arch_touch_nmi_watchdog(void)
323323
}
324324
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
325325

326-
static void start_watchdog_timer_on(unsigned int cpu)
327-
{
328-
struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);
329-
330-
per_cpu(wd_timer_tb, cpu) = get_tb();
331-
332-
timer_setup(t, wd_timer_fn, TIMER_PINNED);
333-
wd_timer_reset(cpu, t);
334-
}
335-
336-
static void stop_watchdog_timer_on(unsigned int cpu)
337-
{
338-
struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);
339-
340-
del_timer_sync(t);
341-
}
342-
343-
static int start_wd_on_cpu(unsigned int cpu)
326+
static void start_watchdog(void *arg)
344327
{
328+
struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
329+
int cpu = smp_processor_id();
345330
unsigned long flags;
346331

347332
if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
348333
WARN_ON(1);
349-
return 0;
334+
return;
350335
}
351336

352337
if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
353-
return 0;
338+
return;
354339

355340
if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
356-
return 0;
341+
return;
357342

358343
wd_smp_lock(&flags);
359344
cpumask_set_cpu(cpu, &wd_cpus_enabled);
@@ -363,27 +348,40 @@ static int start_wd_on_cpu(unsigned int cpu)
363348
}
364349
wd_smp_unlock(&flags);
365350

366-
start_watchdog_timer_on(cpu);
351+
*this_cpu_ptr(&wd_timer_tb) = get_tb();
367352

368-
return 0;
353+
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
354+
hrtimer->function = watchdog_timer_fn;
355+
hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms),
356+
HRTIMER_MODE_REL_PINNED);
369357
}
370358

371-
static int stop_wd_on_cpu(unsigned int cpu)
359+
static int start_watchdog_on_cpu(unsigned int cpu)
372360
{
361+
return smp_call_function_single(cpu, start_watchdog, NULL, true);
362+
}
363+
364+
static void stop_watchdog(void *arg)
365+
{
366+
struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
367+
int cpu = smp_processor_id();
373368
unsigned long flags;
374369

375370
if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
376-
return 0; /* Can happen in CPU unplug case */
371+
return; /* Can happen in CPU unplug case */
377372

378-
stop_watchdog_timer_on(cpu);
373+
hrtimer_cancel(hrtimer);
379374

380375
wd_smp_lock(&flags);
381376
cpumask_clear_cpu(cpu, &wd_cpus_enabled);
382377
wd_smp_unlock(&flags);
383378

384379
wd_smp_clear_cpu_pending(cpu, get_tb());
380+
}
385381

386-
return 0;
382+
static int stop_watchdog_on_cpu(unsigned int cpu)
383+
{
384+
return smp_call_function_single(cpu, stop_watchdog, NULL, true);
387385
}
388386

389387
static void watchdog_calc_timeouts(void)
@@ -402,7 +400,7 @@ void watchdog_nmi_stop(void)
402400
int cpu;
403401

404402
for_each_cpu(cpu, &wd_cpus_enabled)
405-
stop_wd_on_cpu(cpu);
403+
stop_watchdog_on_cpu(cpu);
406404
}
407405

408406
void watchdog_nmi_start(void)
@@ -411,7 +409,7 @@ void watchdog_nmi_start(void)
411409

412410
watchdog_calc_timeouts();
413411
for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
414-
start_wd_on_cpu(cpu);
412+
start_watchdog_on_cpu(cpu);
415413
}
416414

417415
/*
@@ -423,7 +421,8 @@ int __init watchdog_nmi_probe(void)
423421

424422
err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
425423
"powerpc/watchdog:online",
426-
start_wd_on_cpu, stop_wd_on_cpu);
424+
start_watchdog_on_cpu,
425+
stop_watchdog_on_cpu);
427426
if (err < 0) {
428427
pr_warn("could not be initialized");
429428
return err;

0 commit comments

Comments
 (0)