Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for possible false-positive GC delays #8483

Merged
merged 5 commits into from Jun 14, 2023
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
57 changes: 44 additions & 13 deletions src/Orleans.Runtime/Silo/Watchdog.cs
Expand Up @@ -9,7 +9,7 @@ namespace Orleans.Runtime
internal class Watchdog
{
private readonly CancellationTokenSource cancellation = new CancellationTokenSource();
private static readonly TimeSpan heartbeatPeriod = TimeSpan.FromMilliseconds(1000);
private static readonly TimeSpan gcHeartbeatPeriod = TimeSpan.FromMilliseconds(1000);
private readonly TimeSpan healthCheckPeriod;
private DateTime lastHeartbeat;
private DateTime lastWatchdogCheck;
Expand All @@ -19,7 +19,8 @@ internal class Watchdog

private readonly List<IHealthCheckParticipant> participants;
private readonly ILogger logger;
private Thread thread;
private Thread gcThread;
private Thread participantsThread;

public Watchdog(TimeSpan watchdogPeriod, List<IHealthCheckParticipant> watchables, ILogger<Watchdog> logger)
{
Expand All @@ -32,37 +33,45 @@ public void Start()
{
logger.LogInformation("Starting Silo Watchdog.");

if (thread is not null)
if (gcThread is not null)
{
throw new InvalidOperationException("Watchdog.Start may not be called more than once");
}

var now = DateTime.UtcNow;
lastHeartbeat = now;
lastWatchdogCheck = now;
cumulativeGCPauseDuration = GC.GetTotalPauseDuration();

this.thread = new Thread(this.Run)
this.gcThread = new Thread(this.RunGCCheck)
{
IsBackground = true,
Name = "Orleans.Runtime.Watchdog.GCMonitor",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GCMonitor is a misnomer, since it is monitoring for execution stalls in general. Maybe call it StallMonitor or RuntimeMonitor.

};
this.gcThread.Start();

lastWatchdogCheck = now;

this.participantsThread = new Thread(this.RunParticipantsCheck)
{
IsBackground = true,
Name = "Orleans.Runtime.Watchdog",
Name = "Orleans.Runtime.Watchdog.ParticipantsMonitor",
};
this.thread.Start();
this.participantsThread.Start();
}

/// <summary>
/// Requests cancellation on the watchdog's <see cref="CancellationTokenSource"/>.
/// The monitoring loops test this flag at the top of each iteration.
/// </summary>
public void Stop() => this.cancellation.Cancel();

protected void Run()
protected void RunGCCheck()
{
while (!this.cancellation.IsCancellationRequested)
{
try
{
WatchdogHeartbeatTick();
Thread.Sleep(heartbeatPeriod);
GCWatchdogHeartbeatTick();
Thread.Sleep(gcHeartbeatPeriod);
}
catch (ThreadAbortException)
{
Expand All @@ -75,7 +84,27 @@ protected void Run()
}
}

private void WatchdogHeartbeatTick()
/// <summary>
/// Thread body for the participants monitor. Repeatedly runs
/// <c>ParticipantsWatchdogHeartbeatTick</c>, sleeping for <c>healthCheckPeriod</c>
/// after each tick, and returns once cancellation has been requested.
/// </summary>
protected void RunParticipantsCheck()
{
    while (true)
    {
        // Cancellation is only observed here, between iterations.
        if (this.cancellation.IsCancellationRequested)
        {
            return;
        }

        try
        {
            ParticipantsWatchdogHeartbeatTick();
            Thread.Sleep(healthCheckPeriod);
        }
        catch (ThreadAbortException)
        {
            // A thread abort is taken to mean the silo is shutting down; deliberately not logged.
        }
        catch (Exception exc)
        {
            // Any other failure is logged and the loop keeps going so monitoring is not lost.
            logger.LogError((int)ErrorCode.Watchdog_InternalError, exc, "Watchdog encountered an internal error");
        }
    }
}

private void GCWatchdogHeartbeatTick()
{
try
{
Expand All @@ -86,7 +115,10 @@ private void WatchdogHeartbeatTick()
lastHeartbeat = DateTime.UtcNow;
cumulativeGCPauseDuration = GC.GetTotalPauseDuration();
}
}

private void ParticipantsWatchdogHeartbeatTick()
{
var timeSinceLastWatchdogCheck = DateTime.UtcNow - lastWatchdogCheck;
if (timeSinceLastWatchdogCheck <= healthCheckPeriod)
{
Expand Down Expand Up @@ -136,7 +168,7 @@ private static void CheckYourOwnHealth(DateTime lastCheckTime, TimeSpan lastCumu
{
var timeSinceLastTick = DateTime.UtcNow - lastCheckTime;
var pauseDurationSinceLastTick = GC.GetTotalPauseDuration() - lastCumulativeGCPauseDuration;
if (timeSinceLastTick > heartbeatPeriod.Multiply(2))
if (timeSinceLastTick > gcHeartbeatPeriod.Multiply(2))
{
var gc = new[] { GC.CollectionCount(0), GC.CollectionCount(1), GC.CollectionCount(2) };
logger.LogWarning(
Expand All @@ -152,4 +184,3 @@ private static void CheckYourOwnHealth(DateTime lastCheckTime, TimeSpan lastCumu
}
}
}