Skip to content

Commit

Permalink
Check GPU thread time in determining GPU hangs in Windows GPU watchdo…
Browse files Browse the repository at this point in the history
…g V2

The GPU main thread may be de-scheduled and does not have enough time
to finish the task. This CL count the total thread time the GPU main
thread spent doing the job when determining whether the GPU thread
hangs or not. The watchdog allows four timeouts if the GPU main thread
doesn't get enough time. After that, the GPU process will be
terminated.

Thread executing time of an arbitrary thread is only available on Windows.
Therefore, this mechanism is not enabled on the other platforms.

Bug: 949839

Change-Id: I0c45293c10cf290ca43c381afccb48661062d553
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1920327
Reviewed-by: Zhenyao Mo <zmo@chromium.org>
Commit-Queue: Maggie Chen <magchen@chromium.org>
Cr-Commit-Position: refs/heads/master@{#716477}
  • Loading branch information
Maggie Chen authored and Commit Bot committed Nov 19, 2019
1 parent 2caaa92 commit 972a8d2
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 21 deletions.
47 changes: 45 additions & 2 deletions gpu/ipc/service/gpu_watchdog_thread_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "base/test/power_monitor_test_base.h"
#include "base/threading/thread_task_runner_handle.h"
#include "base/time/time.h"
#include "build/build_config.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace gpu {
Expand Down Expand Up @@ -136,9 +137,16 @@ void GpuWatchdogPowerTest::LongTaskOnResume(

// GPU Hang In Initialization
TEST_F(GpuWatchdogTest, GpuInitializationHang) {
// Gpu init (5000 ms) takes longer than timeout (2000 ms).
// GPU init takes longer than timeout.
#if defined(OS_WIN)
SimpleTask(kGpuWatchdogTimeoutForTesting * kInitFactor +
kGpuWatchdogTimeoutForTesting *
kMaxCountOfMoreGpuThreadTimeAllowed +
base::TimeDelta::FromMilliseconds(3000));
#else
SimpleTask(kGpuWatchdogTimeoutForTesting * kInitFactor +
base::TimeDelta::FromMilliseconds(3000));
#endif

// Gpu hangs. OnInitComplete() is not called

Expand Down Expand Up @@ -183,11 +191,20 @@ TEST_F(GpuWatchdogTest, GpuRunningATaskHang) {
// Report gpu init complete
watchdog_thread_->OnInitComplete();

// Start running a GPU task. This long task takes 6000 milliseconds to finish.
// Start running a GPU task.
#if defined(OS_WIN)
main_loop.task_runner()->PostTask(
FROM_HERE,
base::BindOnce(&SimpleTask, kGpuWatchdogTimeoutForTesting * 2 +
kGpuWatchdogTimeoutForTesting *
kMaxCountOfMoreGpuThreadTimeAllowed +
base::TimeDelta::FromMilliseconds(4000)));
#else
main_loop.task_runner()->PostTask(
FROM_HERE,
base::BindOnce(&SimpleTask, kGpuWatchdogTimeoutForTesting * 2 +
base::TimeDelta::FromMilliseconds(4000)));
#endif

main_loop.task_runner()->PostTask(FROM_HERE, run_loop.QuitClosure());
run_loop.Run();
Expand Down Expand Up @@ -228,6 +245,18 @@ TEST_F(GpuWatchdogTest, GpuSwitchingToForegroundHang) {
// A task stays in the background for 200 milliseconds, and then
// switches to the foreground and runs for 6000 milliseconds. This is longer
// than the first-time foreground watchdog timeout (2000 ms).
#if defined(OS_WIN)
main_loop.task_runner()->PostTask(
FROM_HERE,
base::BindOnce(&GpuWatchdogTest::LongTaskFromBackgroundToForeground,
base::Unretained(this),
/*duration*/ kGpuWatchdogTimeoutForTesting * 2 +
kGpuWatchdogTimeoutForTesting *
kMaxCountOfMoreGpuThreadTimeAllowed +
base::TimeDelta::FromMilliseconds(4200),
/*time_to_switch_to_foreground*/
base::TimeDelta::FromMilliseconds(200)));
#else
main_loop.task_runner()->PostTask(
FROM_HERE,
base::BindOnce(&GpuWatchdogTest::LongTaskFromBackgroundToForeground,
Expand All @@ -236,6 +265,7 @@ TEST_F(GpuWatchdogTest, GpuSwitchingToForegroundHang) {
base::TimeDelta::FromMilliseconds(4200),
/*time_to_switch_to_foreground*/
base::TimeDelta::FromMilliseconds(200)));
#endif

main_loop.task_runner()->PostTask(FROM_HERE, run_loop.QuitClosure());
run_loop.Run();
Expand Down Expand Up @@ -272,6 +302,18 @@ TEST_F(GpuWatchdogPowerTest, GpuOnResumeHang) {
// This task stays in the suspension mode for 200 milliseconds, and it
// wakes up on power resume and then runs for 6000 milliseconds. This is
// longer than the watchdog resume timeout (2000 ms).
#if defined(OS_WIN)
main_loop.task_runner()->PostTask(
FROM_HERE,
base::BindOnce(
&GpuWatchdogPowerTest::LongTaskOnResume, base::Unretained(this),
/*duration*/ kGpuWatchdogTimeoutForTesting * kRestartFactor +
kGpuWatchdogTimeoutForTesting *
kMaxCountOfMoreGpuThreadTimeAllowed +
base::TimeDelta::FromMilliseconds(4200),
/*time_to_power_resume*/
base::TimeDelta::FromMilliseconds(200)));
#else
main_loop.task_runner()->PostTask(
FROM_HERE,
base::BindOnce(
Expand All @@ -280,6 +322,7 @@ TEST_F(GpuWatchdogPowerTest, GpuOnResumeHang) {
base::TimeDelta::FromMilliseconds(4200),
/*time_to_power_resume*/
base::TimeDelta::FromMilliseconds(200)));
#endif

main_loop.task_runner()->PostTask(FROM_HERE, run_loop.QuitClosure());
run_loop.Run();
Expand Down
120 changes: 103 additions & 17 deletions gpu/ipc/service/gpu_watchdog_thread_v2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "base/atomicops.h"
#include "base/bind.h"
#include "base/bind_helpers.h"
#include "base/bit_cast.h"
#include "base/debug/alias.h"
#include "base/files/file_path.h"
#include "base/message_loop/message_loop_current.h"
Expand All @@ -21,7 +22,6 @@
#include "base/threading/platform_thread.h"
#include "base/threading/thread_task_runner_handle.h"
#include "base/time/time.h"
#include "build/build_config.h"
#include "gpu/config/gpu_crash_keys.h"

namespace gpu {
Expand Down Expand Up @@ -92,6 +92,17 @@ GpuWatchdogThreadImplV2::GpuWatchdogThreadImplV2(base::TimeDelta timeout,
is_test_mode_(is_test_mode),
watched_gpu_task_runner_(base::ThreadTaskRunnerHandle::Get()) {
base::MessageLoopCurrent::Get()->AddTaskObserver(this);
#if defined(OS_WIN)
// GetCurrentThread returns a pseudo-handle that cannot be used by one thread
// to identify another. DuplicateHandle creates a "real" handle that can be
// used for this purpose.
if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(),
GetCurrentProcess(), &watched_thread_handle_,
THREAD_QUERY_INFORMATION, FALSE, 0)) {
watched_thread_handle_ = nullptr;
}
#endif

Arm();
}

Expand All @@ -102,6 +113,10 @@ GpuWatchdogThreadImplV2::~GpuWatchdogThreadImplV2() {
base::MessageLoopCurrent::Get()->RemoveTaskObserver(this);
base::PowerMonitor::RemoveObserver(this);
GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogEnd);
#if defined(OS_WIN)
if (watched_thread_handle_)
CloseHandle(watched_thread_handle_);
#endif
}

// static
Expand Down Expand Up @@ -182,15 +197,23 @@ void GpuWatchdogThreadImplV2::Init() {

// Get and Invalidate weak_ptr should be done on the watchdog thread only.
weak_ptr_ = weak_factory_.GetWeakPtr();
base::TimeDelta timeout = watchdog_timeout_ * kInitFactor;
task_runner()->PostDelayedTask(
FROM_HERE,
base::BindOnce(&GpuWatchdogThreadImplV2::OnWatchdogTimeout, weak_ptr_),
watchdog_timeout_ * kInitFactor);
timeout);

last_arm_disarm_counter_ = base::subtle::NoBarrier_Load(&arm_disarm_counter_);
watchdog_start_timeticks_ = base::TimeTicks::Now();
last_on_watchdog_timeout_timeticks_ = watchdog_start_timeticks_;
last_on_watchdog_timeout_time_ = base::Time::Now();
#if defined(OS_WIN)
if (watched_thread_handle_) {
if (base::ThreadTicks::IsSupported())
base::ThreadTicks::WaitUntilInitialized();
last_on_watchdog_timeout_thread_ticks_ = GetWatchedThreadTime();
remaining_watched_thread_ticks_ = timeout;
}
#endif

GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogStart);
}
Expand Down Expand Up @@ -283,12 +306,18 @@ void GpuWatchdogThreadImplV2::RestartWatchdogTimeoutTask() {
// Make the timeout twice long. The system/gpu might be very slow right
// after resume or foregrounded.
weak_ptr_ = weak_factory_.GetWeakPtr();
base::TimeDelta timeout = watchdog_timeout_ * kRestartFactor;
task_runner()->PostDelayedTask(
FROM_HERE,
base::BindOnce(&GpuWatchdogThreadImplV2::OnWatchdogTimeout, weak_ptr_),
watchdog_timeout_ * kRestartFactor);
timeout);
last_on_watchdog_timeout_timeticks_ = base::TimeTicks::Now();
last_on_watchdog_timeout_time_ = base::Time::Now();
#if defined(OS_WIN)
if (watched_thread_handle_) {
last_on_watchdog_timeout_thread_ticks_ = GetWatchedThreadTime();
remaining_watched_thread_ticks_ = timeout;
}
#endif
}
}

Expand Down Expand Up @@ -336,15 +365,16 @@ void GpuWatchdogThreadImplV2::OnWatchdogTimeout() {
base::subtle::Atomic32 arm_disarm_counter =
base::subtle::NoBarrier_Load(&arm_disarm_counter_);

// disarmed is true if it's an even number.
bool disarmed = arm_disarm_counter % 2 == 0;
// Collect all needed info for gpu hang detection.
bool disarmed = arm_disarm_counter % 2 == 0; // even number
bool gpu_makes_progress = arm_disarm_counter != last_arm_disarm_counter_;
last_arm_disarm_counter_ = arm_disarm_counter;
bool gpu_thread_needs_more_time =
WatchedThreadNeedsMoreTime(disarmed || gpu_makes_progress);

// No gpu hang is detected. Continue with another OnWatchdogTimeout
if (disarmed || gpu_makes_progress) {
// No gpu hang is detected. Continue with another OnWatchdogTimeout task
if (disarmed || gpu_makes_progress || gpu_thread_needs_more_time) {
last_on_watchdog_timeout_timeticks_ = base::TimeTicks::Now();
last_on_watchdog_timeout_time_ = base::Time::Now();
is_first_timeout_after_power_resume = false;

task_runner()->PostDelayedTask(
Expand All @@ -366,6 +396,66 @@ bool GpuWatchdogThreadImplV2::GpuIsAlive() {
return (gpu_makes_progress);
}

bool GpuWatchdogThreadImplV2::WatchedThreadNeedsMoreTime(
bool no_gpu_hang_detected) {
#if defined(OS_WIN)
if (!watched_thread_handle_)
return false;

base::ThreadTicks now = GetWatchedThreadTime();
base::TimeDelta thread_time_elapsed =
now - last_on_watchdog_timeout_thread_ticks_;
last_on_watchdog_timeout_thread_ticks_ = now;
remaining_watched_thread_ticks_ -= thread_time_elapsed;

if (no_gpu_hang_detected ||
count_of_more_gpu_thread_time_allowed >=
kMaxCountOfMoreGpuThreadTimeAllowed ||
thread_time_elapsed < base::TimeDelta() /* bogus data */ ||
remaining_watched_thread_ticks_ <= base::TimeDelta()) {
// Reset the remaining thread ticks.
remaining_watched_thread_ticks_ = watchdog_timeout_;
count_of_more_gpu_thread_time_allowed = 0;
return false;
} else {
count_of_more_gpu_thread_time_allowed++;
return true;
}
#else
return false;
#endif
}

#if defined(OS_WIN)
base::ThreadTicks GpuWatchdogThreadImplV2::GetWatchedThreadTime() {
DCHECK(watched_thread_handle_);

if (base::ThreadTicks::IsSupported()) {
// Note: GetForThread() might return bogus results if running on different
// CPUs between two calls.
return base::ThreadTicks::GetForThread(
base::PlatformThreadHandle(watched_thread_handle_));
} else {
FILETIME creation_time;
FILETIME exit_time;
FILETIME kernel_time;
FILETIME user_time;
BOOL result = GetThreadTimes(watched_thread_handle_, &creation_time,
&exit_time, &kernel_time, &user_time);
if (!result)
return base::ThreadTicks();

// Need to bit_cast to fix alignment, then divide by 10 to convert
// 100-nanoseconds to microseconds.
int64_t user_time_us = bit_cast<int64_t, FILETIME>(user_time) / 10;
int64_t kernel_time_us = bit_cast<int64_t, FILETIME>(kernel_time) / 10;

return base::ThreadTicks() +
base::TimeDelta::FromMicroseconds(user_time_us + kernel_time_us);
}
}
#endif

void GpuWatchdogThreadImplV2::DeliberatelyTerminateToRecoverFromHang() {
DCHECK(watchdog_thread_task_runner_->BelongsToCurrentThread());
// If this is for gpu testing, do not terminate the gpu process.
Expand Down Expand Up @@ -400,13 +490,9 @@ void GpuWatchdogThreadImplV2::DeliberatelyTerminateToRecoverFromHang() {
function_begin_timeticks - last_on_watchdog_timeout_timeticks_;
base::debug::Alias(&timeticks_elapses);

// If clock_time_elapses is much longer than time_elapses, it might be a sign
// of a busy system.
base::Time current_time = base::Time::Now();
base::TimeDelta time_elapses = current_time - last_on_watchdog_timeout_time_;
base::debug::Alias(&current_time);
base::debug::Alias(&last_on_watchdog_timeout_time_);
base::debug::Alias(&time_elapses);
#if defined(OS_WIN)
base::debug::Alias(&remaining_watched_thread_ticks_);
#endif

GpuWatchdogHistogram(GpuWatchdogThreadEvent::kGpuWatchdogKill);

Expand Down
28 changes: 26 additions & 2 deletions gpu/ipc/service/gpu_watchdog_thread_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
#include "gpu/ipc/service/gpu_watchdog_thread.h"

namespace gpu {
#if defined(OS_WIN)
// If the actual time the watched GPU thread spent doing actual work is less
// than the wathdog timeout, the GPU thread can continue running through
// OnGPUWatchdogTimeout for at most 4 times before the gpu thread is killed.
constexpr int kMaxCountOfMoreGpuThreadTimeAllowed = 4;
#endif

class GPU_IPC_SERVICE_EXPORT GpuWatchdogThreadImplV2
: public GpuWatchdogThread,
Expand Down Expand Up @@ -58,6 +64,10 @@ class GPU_IPC_SERVICE_EXPORT GpuWatchdogThreadImplV2
bool IsArmed();
void OnWatchdogTimeout();
bool GpuIsAlive();
bool WatchedThreadNeedsMoreTime(bool no_gpu_hang_detected);
#if defined(OS_WIN)
base::ThreadTicks GetWatchedThreadTime();
#endif

// Do not change the function name. It is used for [GPU HANG] carsh reports.
void DeliberatelyTerminateToRecoverFromHang();
Expand All @@ -82,13 +92,27 @@ class GPU_IPC_SERVICE_EXPORT GpuWatchdogThreadImplV2
base::TimeTicks backgrounded_timeticks_;
base::TimeTicks foregrounded_timeticks_;

// Time: Interpreting the wall-clock time provided by a remote system.
// TimeTicks: Tracking the amount of time a task runs. Executing delayed
// tasks at the right time.
// ThreadTicks: Use this timer to (approximately) measure how much time the
// calling thread spent doing actual work vs. being de-scheduled.

// The time the last OnWatchdogTimeout() was called.
base::TimeTicks last_on_watchdog_timeout_timeticks_;
base::Time last_on_watchdog_timeout_time_;
#if defined(OS_WIN)
base::ThreadTicks last_on_watchdog_timeout_thread_ticks_;

// The difference between the timeout and the actual time the watched thread
// spent doing actual work.
base::TimeDelta remaining_watched_thread_ticks_;

// The Windows thread hanndle of the watched GPU main thread.
void* watched_thread_handle_ = nullptr;

// After GPU hang detected, how many times has the GPU thread been allowed to
// continue due to not enough thread time.
int count_of_more_gpu_thread_time_allowed = 0;
#endif

// The system has entered the power suspension mode.
bool in_power_suspension_ = false;
Expand Down

0 comments on commit 972a8d2

Please sign in to comment.