Skip to content

Commit

Permalink
Fix for possible false-positive GC delays (#8483)
Browse files Browse the repository at this point in the history
* Fix for possible false-positive GC delays

* rename threads-vars-methods to describe scope

* rename gc to stall

* minor overhaul for watchdog

* Demote "Starting Silo watchdog" log level from Information to Debug

---------

Co-authored-by: Reuben Bond <rebond@microsoft.com>
  • Loading branch information
ntovas and ReubenBond committed Jun 14, 2023
1 parent a027318 commit e961202
Showing 1 changed file with 103 additions and 73 deletions.
176 changes: 103 additions & 73 deletions src/Orleans.Runtime/Silo/Watchdog.cs
@@ -1,3 +1,4 @@
#nullable enable
using System;
using System.Collections.Generic;
using System.Text;
Expand All @@ -6,150 +7,179 @@

namespace Orleans.Runtime
{
    /// <summary>
    /// Monitors runtime and component health periodically, reporting complaints.
    /// Two dedicated background threads are used:
    /// a "platform" watchdog that detects .NET-runtime-level stalls (e.g. long GC
    /// pauses or thread-pool starvation), and a "component" watchdog that polls the
    /// registered <see cref="IHealthCheckParticipant"/> instances.
    /// </summary>
    internal class Watchdog
    {
        // How often the platform watchdog wakes up to check for runtime stalls.
        private static readonly TimeSpan PlatformWatchdogHeartbeatPeriod = TimeSpan.FromMilliseconds(1000);
        private readonly CancellationTokenSource _cancellation = new CancellationTokenSource();
        private readonly TimeSpan _componentHealthCheckPeriod;
        private readonly List<IHealthCheckParticipant> _participants;
        private readonly ILogger _logger;

        // Measures time since the platform watchdog last completed a tick; restarted every
        // loop iteration so a large Elapsed value indicates the process itself was stalled.
        private ValueStopwatch _platformWatchdogStopwatch;

        // Measures time since the component watchdog last began a health-check pass.
        private ValueStopwatch _componentWatchdogStopwatch;

        // GC pause duration since process start, sampled at the end of each platform tick so
        // the next tick can attribute stall time to GC pauses that occurred in between.
        private TimeSpan _cumulativeGCPauseDuration;

        private DateTime _lastComponentHealthCheckTime;
        private Thread? _platformWatchdogThread;
        private Thread? _componentWatchdogThread;

        /// <summary>
        /// Initializes a new <see cref="Watchdog"/>.
        /// </summary>
        /// <param name="watchdogPeriod">Interval between component health-check passes.</param>
        /// <param name="participants">Components to poll for health complaints.</param>
        /// <param name="logger">Logger for watchdog diagnostics.</param>
        public Watchdog(TimeSpan watchdogPeriod, List<IHealthCheckParticipant> participants, ILogger<Watchdog> logger)
        {
            _logger = logger;
            _componentHealthCheckPeriod = watchdogPeriod;
            _participants = participants;
        }

        /// <summary>
        /// Starts both watchdog threads.
        /// </summary>
        /// <exception cref="InvalidOperationException">Thrown if called more than once.</exception>
        public void Start()
        {
            if (_logger.IsEnabled(LogLevel.Debug))
            {
                _logger.LogDebug("Starting Silo watchdog");
            }

            if (_platformWatchdogThread is not null)
            {
                throw new InvalidOperationException("Watchdog.Start may not be called more than once");
            }

            _platformWatchdogStopwatch = ValueStopwatch.StartNew();
            _cumulativeGCPauseDuration = GC.GetTotalPauseDuration();

            _platformWatchdogThread = new Thread(RunPlatformWatchdog)
            {
                IsBackground = true,
                Name = "Orleans.Runtime.Watchdog.Platform",
            };
            _platformWatchdogThread.Start();

            _componentWatchdogStopwatch = ValueStopwatch.StartNew();
            _lastComponentHealthCheckTime = DateTime.UtcNow;

            _componentWatchdogThread = new Thread(RunComponentWatchdog)
            {
                IsBackground = true,
                Name = "Orleans.Runtime.Watchdog.Component",
            };
            _componentWatchdogThread.Start();
        }

        /// <summary>
        /// Signals both watchdog threads to exit at the top of their next iteration.
        /// </summary>
        public void Stop()
        {
            _cancellation.Cancel();
        }

        /// <summary>
        /// Platform watchdog loop: detects stalls of the .NET runtime itself.
        /// </summary>
        protected void RunPlatformWatchdog()
        {
            while (!_cancellation.IsCancellationRequested)
            {
                try
                {
                    CheckRuntimeHealth();
                }
                catch (Exception exc)
                {
                    _logger.LogError((int)ErrorCode.Watchdog_InternalError, exc, "Platform watchdog encountered an internal error");
                }

                // Sample the baseline AFTER the check so the next tick measures only the
                // interval in between, avoiding false-positive stall/GC reports.
                _platformWatchdogStopwatch.Restart();
                _cumulativeGCPauseDuration = GC.GetTotalPauseDuration();
                Thread.Sleep(PlatformWatchdogHeartbeatPeriod);
            }
        }

        /// <summary>
        /// Reports a warning if the platform tick or the component check pass took
        /// significantly longer (2x) than its expected period.
        /// </summary>
        private void CheckRuntimeHealth()
        {
            var pauseDurationSinceLastTick = GC.GetTotalPauseDuration() - _cumulativeGCPauseDuration;
            var timeSinceLastTick = _platformWatchdogStopwatch.Elapsed;
            if (timeSinceLastTick > PlatformWatchdogHeartbeatPeriod.Multiply(2))
            {
                var gc = new[] { GC.CollectionCount(0), GC.CollectionCount(1), GC.CollectionCount(2) };
                _logger.LogWarning(
                    (int)ErrorCode.SiloHeartbeatTimerStalled,
                    ".NET Runtime Platform stalled for {TimeSinceLastTick}. Total GC Pause duration during that period: {pauseDurationSinceLastTick}. We are now using a total of {TotalMemory}MB memory. Collection counts per generation: 0: {GCGen0Count}, 1: {GCGen1Count}, 2: {GCGen2Count}",
                    timeSinceLastTick,
                    pauseDurationSinceLastTick,
                    GC.GetTotalMemory(false) / (1024 * 1024),
                    gc[0],
                    gc[1],
                    gc[2]);
            }

            var timeSinceLastParticipantCheck = _componentWatchdogStopwatch.Elapsed;
            if (timeSinceLastParticipantCheck > _componentHealthCheckPeriod.Multiply(2))
            {
                _logger.LogWarning(
                    (int)ErrorCode.SiloHeartbeatTimerStalled,
                    "Participant check thread has not completed for {TimeSinceLastTick}, potentially indicating lock contention or deadlock, CPU starvation, or another execution anomaly.",
                    timeSinceLastParticipantCheck);
            }
        }

        /// <summary>
        /// Component watchdog loop: periodically polls all health-check participants.
        /// </summary>
        protected void RunComponentWatchdog()
        {
            while (!_cancellation.IsCancellationRequested)
            {
                try
                {
                    CheckComponentHealth();
                }
                catch (Exception exc)
                {
                    _logger.LogError((int)ErrorCode.Watchdog_InternalError, exc, "Component health check encountered an internal error");
                }

                _componentWatchdogStopwatch.Restart();
                Thread.Sleep(_componentHealthCheckPeriod);
            }
        }

        /// <summary>
        /// Polls every registered participant, aggregating and logging any complaints.
        /// A participant throwing from CheckHealth is logged but does not abort the pass.
        /// </summary>
        private void CheckComponentHealth()
        {
            WatchdogInstruments.HealthChecks.Add(1);
            var numFailedChecks = 0;
            StringBuilder? complaints = null;

            // Restart the timer before the check to reduce false positives for the stall checker.
            _componentWatchdogStopwatch.Restart();
            foreach (var participant in _participants)
            {
                try
                {
                    var ok = participant.CheckHealth(_lastComponentHealthCheckTime, out var complaint);
                    if (!ok)
                    {
                        complaints ??= new StringBuilder();
                        if (complaints.Length > 0)
                        {
                            complaints.Append(' ');
                        }

                        complaints.Append($"{participant.GetType()} failed health check with complaint \"{complaint}\".");
                        ++numFailedChecks;
                    }
                }
                catch (Exception exc)
                {
                    _logger.LogWarning(
                        (int)ErrorCode.Watchdog_ParticipantThrownException,
                        exc,
                        "Health check participant {Participant} has thrown an exception from its CheckHealth method.",
                        participant.GetType());
                }
            }

            if (complaints != null)
            {
                WatchdogInstruments.FailedHealthChecks.Add(1);
                _logger.LogWarning((int)ErrorCode.Watchdog_HealthCheckFailure, "{FailedChecks} of {ParticipantCount} components reported issues. Complaints: {Complaints}", numFailedChecks, _participants.Count, complaints);
            }

            _lastComponentHealthCheckTime = DateTime.UtcNow;
        }
    }
}

0 comments on commit e961202

Please sign in to comment.