Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not kill frontend isolate on uncaught exceptions. #4672

Merged
merged 3 commits into from
Mar 30, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 53 additions & 10 deletions app/lib/service/entrypoint/_isolate.dart
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Future startIsolates({
final stampFile =
File(p.join(Directory.systemTemp.path, 'pub-dev-started.stamp'));
if (stampFile.existsSync()) {
print('[warning-service-restarted]: '
stderr.writeln('[warning-service-restarted]: '
'${stampFile.path} already exists, indicating that this process has been restarted.');
} else {
stampFile.createSync(recursive: true);
Expand All @@ -77,15 +77,20 @@ Future startIsolates({
_setupServiceIsolate();

int frontendStarted = 0;

/// The window after the last isolate start during which errors won't cause
/// frontend isolates to restart.
var restartProtectionOffset = Duration.zero;
var lastStarted = DateTime.now();
int workerStarted = 0;
final statConsumerPorts = <SendPort>[];

Future<void> startFrontendIsolate() async {
frontendStarted++;
final frontendIndex = frontendStarted;
logger.info('About to start frontend isolate #$frontendIndex...');
final ReceivePort errorReceivePort = ReceivePort();
final ReceivePort protocolReceivePort = ReceivePort();
final errorReceivePort = ReceivePort();
final exitReceivePort = ReceivePort();
final protocolReceivePort = ReceivePort();
await Isolate.spawn(
_wrapper,
[
Expand All @@ -96,7 +101,7 @@ Future startIsolates({
),
],
onError: errorReceivePort.sendPort,
onExit: errorReceivePort.sendPort,
onExit: exitReceivePort.sendPort,
errorsAreFatal: true,
);
final protocolMessage = (await protocolReceivePort.take(1).toList())
Expand All @@ -105,25 +110,63 @@ Future startIsolates({
statConsumerPorts.add(protocolMessage.statsConsumerPort);
}
logger.info('Frontend isolate #$frontendIndex started.');
lastStarted = DateTime.now();

StreamSubscription errorSubscription;
StreamSubscription exitSubscription;

/// Tears down the bookkeeping for this frontend isolate.
///
/// Unregisters the isolate's stats consumer port (when it provided one),
/// cancels the error and exit subscriptions if they have been set, and then
/// closes the three receive ports created for this isolate.
Future<void> close() async {
  // Drop the stats port first so it is no longer listed among the consumers.
  final statsPort = protocolMessage.statsConsumerPort;
  if (statsPort != null) {
    statConsumerPorts.remove(statsPort);
  }

  // Stop listening before the underlying ports are closed.
  await errorSubscription?.cancel();
  await exitSubscription?.cancel();

  errorReceivePort.close();
  exitReceivePort.close();
  protocolReceivePort.close();
}

errorSubscription = errorReceivePort.listen((e) async {
print('ERROR from frontend isolate #$frontendIndex: $e');
logger.severe('ERROR from frontend isolate #$frontendIndex', e);
Future<void> restart() async {
await close();
// restart isolate after a brief pause
await Future.delayed(Duration(seconds: 5));
// Restart the isolate after a pause, increasing the pause duration at
// each restart.
//
// NOTE: As this wait period increases, the service may miss /liveness_check
// requests, and eventually AppEngine may just kill the instance
// marking it unreachable.
await Future.delayed(Duration(seconds: 5 + frontendStarted));
await startFrontendIsolate();
}

errorSubscription = errorReceivePort.listen((e) async {
stderr.writeln('ERROR from frontend isolate #$frontendIndex: $e');
logger.severe('ERROR from frontend isolate #$frontendIndex', e);

final now = DateTime.now();
// If the last isolate was started more than an hour ago, we can reset
// the protection.
if (now.isAfter(lastStarted.add(Duration(hours: 1)))) {
restartProtectionOffset = Duration.zero;
}

// If we have recently restarted an isolate, let's keep it running.
if (now.isBefore(lastStarted.add(restartProtectionOffset))) {
return;
}

// Extend restart protection for up to 20 minutes.
if (restartProtectionOffset.inMinutes < 20) {
restartProtectionOffset += Duration(minutes: 4);
}

await restart();
});

exitSubscription = exitReceivePort.listen((e) async {
stderr.writeln(
'Frontend isolate #$frontendIndex exited with message: $e');
logger.warning('Frontend isolate #$frontendIndex exited.', e);
await restart();
});
}

Expand Down Expand Up @@ -195,7 +238,7 @@ Future startIsolates({
}

errorSubscription = errorReceivePort.listen((e) async {
print('ERROR from worker isolate #$workerIndex: $e');
stderr.writeln('ERROR from worker isolate #$workerIndex: $e');
logger.severe('ERROR from worker isolate #$workerIndex', e);
await close();
// restart isolate after a brief pause
Expand Down