Fix for possible false-positive GC delays #8483

Merged · 5 commits · Jun 14, 2023
176 changes: 103 additions & 73 deletions src/Orleans.Runtime/Silo/Watchdog.cs
@@ -1,3 +1,4 @@
+#nullable enable
 using System;
 using System.Collections.Generic;
 using System.Text;
@@ -6,150 +7,179 @@
 
 namespace Orleans.Runtime
 {
+    /// <summary>
+    /// Monitors runtime and component health periodically, reporting complaints.
+    /// </summary>
     internal class Watchdog
     {
-        private readonly CancellationTokenSource cancellation = new CancellationTokenSource();
-        private static readonly TimeSpan heartbeatPeriod = TimeSpan.FromMilliseconds(1000);
-        private readonly TimeSpan healthCheckPeriod;
-        private DateTime lastHeartbeat;
-        private DateTime lastWatchdogCheck;
+        private static readonly TimeSpan PlatformWatchdogHeartbeatPeriod = TimeSpan.FromMilliseconds(1000);
+        private readonly CancellationTokenSource _cancellation = new CancellationTokenSource();
+        private readonly TimeSpan _componentHealthCheckPeriod;
+        private readonly List<IHealthCheckParticipant> _participants;
+        private readonly ILogger _logger;
+        private ValueStopwatch _platformWatchdogStopwatch;
+        private ValueStopwatch _componentWatchdogStopwatch;
 
         // GC pause duration since process start.
-        private TimeSpan cumulativeGCPauseDuration;
+        private TimeSpan _cumulativeGCPauseDuration;
 
-        private readonly List<IHealthCheckParticipant> participants;
-        private readonly ILogger logger;
-        private Thread thread;
+        private DateTime _lastComponentHealthCheckTime;
+        private Thread? _platformWatchdogThread;
+        private Thread? _componentWatchdogThread;
 
-        public Watchdog(TimeSpan watchdogPeriod, List<IHealthCheckParticipant> watchables, ILogger<Watchdog> logger)
+        public Watchdog(TimeSpan watchdogPeriod, List<IHealthCheckParticipant> participants, ILogger<Watchdog> logger)
         {
-            this.logger = logger;
-            healthCheckPeriod = watchdogPeriod;
-            participants = watchables;
+            _logger = logger;
+            _componentHealthCheckPeriod = watchdogPeriod;
+            _participants = participants;
         }
 
         public void Start()
        {
-            logger.LogInformation("Starting Silo Watchdog.");
+            if (_logger.IsEnabled(LogLevel.Debug))
+            {
+                _logger.LogDebug("Starting Silo watchdog");
+            }
 
-            if (thread is not null)
+            if (_platformWatchdogThread is not null)
             {
                 throw new InvalidOperationException("Watchdog.Start may not be called more than once");
             }
 
-            var now = DateTime.UtcNow;
-            lastHeartbeat = now;
-            lastWatchdogCheck = now;
-            cumulativeGCPauseDuration = GC.GetTotalPauseDuration();
+            _platformWatchdogStopwatch = ValueStopwatch.StartNew();
+            _cumulativeGCPauseDuration = GC.GetTotalPauseDuration();
+
+            _platformWatchdogThread = new Thread(RunPlatformWatchdog)
+            {
+                IsBackground = true,
+                Name = "Orleans.Runtime.Watchdog.Platform",
+            };
+            _platformWatchdogThread.Start();
 
-            this.thread = new Thread(this.Run)
+            _componentWatchdogStopwatch = ValueStopwatch.StartNew();
+            _lastComponentHealthCheckTime = DateTime.UtcNow;
+
+            _componentWatchdogThread = new Thread(RunComponentWatchdog)
             {
                 IsBackground = true,
-                Name = "Orleans.Runtime.Watchdog",
+                Name = "Orleans.Runtime.Watchdog.Component",
             };
-            this.thread.Start();
+            _componentWatchdogThread.Start();
         }
 
         public void Stop()
         {
-            cancellation.Cancel();
+            _cancellation.Cancel();
         }
 
-        protected void Run()
+        protected void RunPlatformWatchdog()
         {
-            while (!this.cancellation.IsCancellationRequested)
+            while (!_cancellation.IsCancellationRequested)
             {
                 try
                 {
-                    WatchdogHeartbeatTick();
-                    Thread.Sleep(heartbeatPeriod);
-                }
-                catch (ThreadAbortException)
-                {
-                    // Silo is probably shutting-down, so just ignore and exit
+                    CheckRuntimeHealth();
                 }
                 catch (Exception exc)
                 {
-                    logger.LogError((int)ErrorCode.Watchdog_InternalError, exc, "Watchdog encountered an internal error");
+                    _logger.LogError((int)ErrorCode.Watchdog_InternalError, exc, "Platform watchdog encountered an internal error");
                 }
+
+                _platformWatchdogStopwatch.Restart();
+                _cumulativeGCPauseDuration = GC.GetTotalPauseDuration();
+                Thread.Sleep(PlatformWatchdogHeartbeatPeriod);
             }
         }
 
-        private void WatchdogHeartbeatTick()
+        private void CheckRuntimeHealth()
         {
-            try
+            var pauseDurationSinceLastTick = GC.GetTotalPauseDuration() - _cumulativeGCPauseDuration;
+            var timeSinceLastTick = _platformWatchdogStopwatch.Elapsed;
+            if (timeSinceLastTick > PlatformWatchdogHeartbeatPeriod.Multiply(2))
             {
-                CheckYourOwnHealth(lastHeartbeat, cumulativeGCPauseDuration, logger);
+                var gc = new[] { GC.CollectionCount(0), GC.CollectionCount(1), GC.CollectionCount(2) };
+                _logger.LogWarning(
+                    (int)ErrorCode.SiloHeartbeatTimerStalled,
+                    ".NET Runtime Platform stalled for {TimeSinceLastTick}. Total GC Pause duration during that period: {pauseDurationSinceLastTick}. We are now using a total of {TotalMemory}MB memory. Collection counts per generation: 0: {GCGen0Count}, 1: {GCGen1Count}, 2: {GCGen2Count}",
+                    timeSinceLastTick,
+                    pauseDurationSinceLastTick,
+                    GC.GetTotalMemory(false) / (1024 * 1024),
+                    gc[0],
+                    gc[1],
+                    gc[2]);
             }
-            finally
+
+            var timeSinceLastParticipantCheck = _componentWatchdogStopwatch.Elapsed;
+            if (timeSinceLastParticipantCheck > _componentHealthCheckPeriod.Multiply(2))
             {
-                lastHeartbeat = DateTime.UtcNow;
-                cumulativeGCPauseDuration = GC.GetTotalPauseDuration();
+                _logger.LogWarning(
+                    (int)ErrorCode.SiloHeartbeatTimerStalled,
+                    "Participant check thread has not completed for {TimeSinceLastTick}, potentially indicating lock contention or deadlock, CPU starvation, or another execution anomaly.",
+                    timeSinceLastParticipantCheck);
             }
+        }
 
-            var timeSinceLastWatchdogCheck = DateTime.UtcNow - lastWatchdogCheck;
-            if (timeSinceLastWatchdogCheck <= healthCheckPeriod)
+        protected void RunComponentWatchdog()
+        {
+            while (!_cancellation.IsCancellationRequested)
             {
-                return;
+                try
+                {
+                    CheckComponentHealth();
+                }
+                catch (Exception exc)
+                {
+                    _logger.LogError((int)ErrorCode.Watchdog_InternalError, exc, "Component health check encountered an internal error");
+                }
+
+                _componentWatchdogStopwatch.Restart();
+                Thread.Sleep(_componentHealthCheckPeriod);
             }
+        }
 
+        private void CheckComponentHealth()
+        {
             WatchdogInstruments.HealthChecks.Add(1);
-            int numFailedChecks = 0;
-            StringBuilder reasons = null;
-            foreach (IHealthCheckParticipant participant in participants)
+            var numFailedChecks = 0;
+            StringBuilder? complaints = null;
+
+            // Restart the timer before the check to reduce false positives for the stall checker.
+            _componentWatchdogStopwatch.Restart();
+            foreach (var participant in _participants)
             {
                 try
                 {
-                    bool ok = participant.CheckHealth(lastWatchdogCheck, out var reason);
+                    var ok = participant.CheckHealth(_lastComponentHealthCheckTime, out var complaint);
                     if (!ok)
                     {
-                        reasons ??= new StringBuilder();
-                        if (reasons.Length > 0)
+                        complaints ??= new StringBuilder();
+                        if (complaints.Length > 0)
                         {
-                            reasons.Append(' ');
+                            complaints.Append(' ');
                         }
 
-                        reasons.Append($"{participant.GetType()} failed health check with reason \"{reason}\".");
-                        numFailedChecks++;
+                        complaints.Append($"{participant.GetType()} failed health check with complaint \"{complaint}\".");
+                        ++numFailedChecks;
                     }
                 }
                 catch (Exception exc)
                 {
-                    logger.LogWarning(
+                    _logger.LogWarning(
                         (int)ErrorCode.Watchdog_ParticipantThrownException,
                         exc,
                         "Health check participant {Participant} has thrown an exception from its CheckHealth method.",
                         participant.GetType());
                 }
             }
 
-            if (numFailedChecks > 0)
+            if (complaints != null)
             {
                 WatchdogInstruments.FailedHealthChecks.Add(1);
-                logger.LogWarning((int)ErrorCode.Watchdog_HealthCheckFailure, "Watchdog had {FailedChecks} health Check failure(s) out of {ParticipantCount} health Check participants: {Reasons}", numFailedChecks, participants.Count, reasons.ToString());
+                _logger.LogWarning((int)ErrorCode.Watchdog_HealthCheckFailure, "{FailedChecks} of {ParticipantCount} components reported issues. Complaints: {Complaints}", numFailedChecks, _participants.Count, complaints);
             }
 
-            lastWatchdogCheck = DateTime.UtcNow;
-        }
-
-        private static void CheckYourOwnHealth(DateTime lastCheckTime, TimeSpan lastCumulativeGCPauseDuration, ILogger logger)
-        {
-            var timeSinceLastTick = DateTime.UtcNow - lastCheckTime;
-            var pauseDurationSinceLastTick = GC.GetTotalPauseDuration() - lastCumulativeGCPauseDuration;
-            if (timeSinceLastTick > heartbeatPeriod.Multiply(2))
-            {
-                var gc = new[] { GC.CollectionCount(0), GC.CollectionCount(1), GC.CollectionCount(2) };
-                logger.LogWarning(
-                    (int)ErrorCode.SiloHeartbeatTimerStalled,
-                    ".NET Runtime Platform stalled for {TimeSinceLastTick}. Total GC Pause duration during that period: {pauseDurationSinceLastTick}. We are now using a total of {TotalMemory}MB memory. gc={GCGen0Count}, {GCGen1Count}, {GCGen2Count}",
-                    timeSinceLastTick,
-                    pauseDurationSinceLastTick,
-                    GC.GetTotalMemory(false) / (1024 * 1024),
-                    gc[0],
-                    gc[1],
-                    gc[2]);
-            }
+            _lastComponentHealthCheckTime = DateTime.UtcNow;
         }
     }
 }
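
Why this removes the false positives: previously a single thread both recorded the heartbeat timestamp and ran every participant's CheckHealth on the same loop, so a slow health check, or a long GC pause during one, delayed the next heartbeat tick and was reported as a platform stall. The change splits the work across two dedicated threads and restarts each stall-detection stopwatch immediately before sleeping, so time spent inside the checks themselves is excluded from the next measured interval. Below is a minimal, self-contained sketch of that stopwatch pattern; StallDetectorSketch and its members are hypothetical names for illustration, not Orleans code, and it assumes .NET 7+ for GC.GetTotalPauseDuration.

using System;
using System.Diagnostics;
using System.Threading;

// Minimal sketch of the stall-detection pattern used in the PR. The key detail
// is restarting the stopwatch *before* sleeping, so the time spent inside the
// loop body cannot be mistaken for a platform stall on the next iteration.
internal static class StallDetectorSketch
{
    private static readonly TimeSpan Period = TimeSpan.FromSeconds(1);

    public static void Run(CancellationToken cancellation)
    {
        var stopwatch = Stopwatch.StartNew();
        var lastGcPause = GC.GetTotalPauseDuration(); // requires .NET 7+

        while (!cancellation.IsCancellationRequested)
        {
            // Elapsed covers the previous Sleep plus any GC pause or CPU
            // starvation that delayed this thread's wake-up.
            var elapsed = stopwatch.Elapsed;
            var gcPauseSinceLastTick = GC.GetTotalPauseDuration() - lastGcPause;

            if (elapsed > Period.Multiply(2))
            {
                Console.WriteLine(
                    $"Stalled for {elapsed}; GC pauses account for {gcPauseSinceLastTick} of that.");
            }

            // Re-baseline BEFORE sleeping: everything the loop body did above
            // is excluded from the next interval, avoiding false positives.
            stopwatch.Restart();
            lastGcPause = GC.GetTotalPauseDuration();
            Thread.Sleep(Period);
        }
    }
}

The same idea appears twice in the diff: the platform loop restarts _platformWatchdogStopwatch right before Thread.Sleep, and CheckComponentHealth restarts _componentWatchdogStopwatch before iterating the participants, per its comment about reducing false positives for the stall checker.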