-
Notifications
You must be signed in to change notification settings - Fork 3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Keep supervisor children linked during shutdown #5201
Keep supervisor children linked during shutdown #5201
Conversation
lib/stdlib/src/supervisor.erl
Outdated
case unlink_flush(Pid, Reason0) of | ||
shutdown -> | ||
ok; | ||
{shutdown, _} -> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As {shutdown, Reason} is a way for the child process to terminate itself normally (gracefully but with a trail as to why). Maybe it should also have a guard not (is_permanent(Child)). I must confess I actually have not used permanent children. But that would be backwards compatible and it seems to make sense that if we do not accept normal as a shutdown reason {shutdown, Reason} should not be accepted either and only the shutdown initiated by the supervisor should be considered a non-fault.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense I think.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed as suggested in last commit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍
8b0b87d
to
6faf285
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good!
However, there is poor coverage in some cases in the tests, so test-case improvements would be welcome. The brutal_kill
scenario for dynamic workers is not tested at all. Children terminating simultaneously by themselves are also poorly covered. This can, however, be hard to trigger.
921 | | shutdown(#child{pid=Pid, shutdown=brutal_kill} = Child) ->
-- | -- | --
922 | 11 | Mon = monitor(process, Pid),
923 | 11 | exit(Pid, kill),
924 | 11 | receive
925 | | {'DOWN', Mon, process, Pid, Reason0} ->
926 | 11 | case unlink_flush(Pid, Reason0) of
927 | | killed ->
928 | 11 | ok;
929 | | normal when not (?is_permanent(Child)) ->
930 | :-( | ok;
931 | | Reason ->
932 | :-( | {error, Reason}
933 | | end
934 | | end;
935 | | shutdown(#child{pid=Pid, shutdown=Time} = Child) ->
936 | 112083 | Mon = monitor(process, Pid),
937 | 112083 | exit(Pid, shutdown),
938 | 112083 | receive
939 | | {'DOWN', Mon, process, Pid, Reason0} ->
940 | 112080 | case unlink_flush(Pid, Reason0) of
941 | | shutdown ->
942 | 112080 | ok;
943 | | {shutdown, _} ->
944 | :-( | ok;
945 | | normal when not (?is_permanent(Child)) ->
946 | :-( | ok;
947 | | Reason ->
948 | :-( | {error, Reason}
949 | | end
950 | | after Time ->
951 | 1 | exit(Pid, kill),
952 | 1 | receive
953 | | {'DOWN', Mon, process, Pid, Reason0} ->
954 | 1 | case unlink_flush(Pid, Reason0) of
955 | | shutdown ->
956 | :-( | ok;
957 | | {shutdown, _} ->
958 | :-( | ok;
959 | | normal when not (?is_permanent(Child)) ->
960 | :-( | ok;
961 | | Reason ->
962 | 1 | {error, Reason}
963 | | end
964 | | end
965 | | end.
966 | |
967 | | unlink_flush(Pid, DefaultReason) ->
968 | 112405 | unlink(Pid),
969 | 112405 | receive
970 | | {'EXIT', Pid, Reason} ->
971 | 112404 | Reason
972 | | after 0 ->
973 | 1 | DefaultReason
974 | | end.
975 | |
976 | | %%-----------------------------------------------------------------
977 | | %% Func: terminate_dynamic_children/1
978 | | %% Args: State
979 | | %% Returns: ok
980 | | %%
981 | | %% Shutdown all dynamic children. This happens when the supervisor is
982 | | %% stopped. Because the supervisor can have millions of dynamic children, we
983 | | %% can have a significative overhead here.
984 | | %%-----------------------------------------------------------------
985 | | terminate_dynamic_children(State) ->
986 | 22 | Child = get_dynamic_child(State),
987 | 22 | Pids = dyn_fold(
988 | | fun
989 | | (P, Acc) when is_pid(P) ->
990 | 313 | Mon = monitor(process, P),
991 | 313 | case Child#child.shutdown of
992 | :-( | brutal_kill -> exit(P, kill);
993 | 313 | _ -> exit(P, shutdown)
994 | | end,
995 | 313 | Acc#{{P, Mon} => true};
996 | | (?restarting(_), Acc) ->
997 | 1 | Acc
998 | | end,
999 | | #{},
1000 | | State
1001 | | ),
1002 | 22 | TRef = case Child#child.shutdown of
1003 | | brutal_kill ->
1004 | 2 | undefined;
1005 | | infinity ->
1006 | 1 | undefined;
1007 | | Time ->
1008 | 19 | erlang:start_timer(Time, self(), kill)
1009 | | end,
1010 | 22 | Sz = maps:size(Pids),
1011 | 22 | EStack = wait_dynamic_children(Child, Pids, Sz, TRef, #{}),
1012 | | %% Unroll stacked errors and report them
1013 | 22 | maps:foreach(fun(Reason, Ls) ->
1014 | 1 | ?report_error(shutdown_error, Reason,
1015 | :-( | Child#child{pid=Ls}, State#state.name)
1016 | | end, EStack).
1017 | |
1018 | | wait_dynamic_children(_Child, _Pids, 0, undefined, EStack) ->
1019 | 4 | EStack;
1020 | | wait_dynamic_children(_Child, _Pids, 0, TRef, EStack) ->
1021 | | %% If the timer has expired before its cancellation, we must empty the
1022 | | %% mail-box of the 'timeout'-message.
1023 | 18 | _ = erlang:cancel_timer(TRef),
1024 | 18 | receive
1025 | | {timeout, TRef, kill} ->
1026 | :-( | EStack
1027 | | after 0 ->
1028 | 18 | EStack
1029 | | end;
1030 | | wait_dynamic_children(#child{shutdown=brutal_kill} = Child, Pids, Sz,
1031 | | TRef, EStack) ->
1032 | :-( | receive
1033 | | {'DOWN', Mon, process, Pid, Reason0}
1034 | | when is_map_key({Pid, Mon}, Pids) ->
1035 | :-( | case unlink_flush(Pid, Reason0) of
1036 | | killed ->
1037 | :-( | wait_dynamic_children(Child, maps:remove({Pid, Mon}, Pids),
1038 | | Sz-1, TRef, EStack);
1039 | |
1040 | | normal when not (?is_permanent(Child)) ->
1041 | :-( | wait_dynamic_children(Child, maps:remove({Pid, Mon}, Pids),
1042 | | Sz-1, TRef, EStack);
1043 | | Reason ->
1044 | :-( | wait_dynamic_children(Child, maps:remove({Pid, Mon}, Pids),
1045 | | Sz-1, TRef, maps_prepend(Reason, Pid,
1046 | | EStack))
1047 | | end
1048 | | end;
1049 | | wait_dynamic_children(Child, Pids, Sz, TRef, EStack) ->
1050 | 314 | receive
1051 | | {'DOWN', Mon, process, Pid, Reason0}
1052 | | when is_map_key({Pid, Mon}, Pids) ->
1053 | 313 | case unlink_flush(Pid, Reason0) of
1054 | | shutdown ->
1055 | 312 | wait_dynamic_children(Child, maps:remove({Pid, Mon}, Pids),
1056 | | Sz-1, TRef, EStack);
1057 | |
1058 | | {shutdown, _} ->
1059 | :-( | wait_dynamic_children(Child, maps:remove({Pid, Mon}, Pids),
1060 | | Sz-1, TRef, EStack);
1061 | |
1062 | | normal when not (?is_permanent(Child)) ->
1063 | :-( | wait_dynamic_children(Child, maps:remove({Pid, Mon}, Pids),
1064 | | Sz-1, TRef, EStack);
1065 | |
1066 | | Reason ->
1067 | 1 | wait_dynamic_children(Child, maps:remove({Pid, Mon}, Pids),
1068 | | Sz-1, TRef, maps_prepend(Reason, Pid,
1069 | | EStack))
1070 | | end;
1071 | |
1072 | | {timeout, TRef, kill} ->
1073 | 1 | maps:foreach(fun({P, _}, _) -> exit(P, kill) end, Pids),
1074 | 1 | wait_dynamic_children(Child, Pids, Sz, undefined, EStack)
1075 | | end.
1076 | |
1077 | | maps_prepend(Key, Value, Map) ->
1078 | 1 | case maps:find(Key, Map) of
1079 | | {ok, Values} ->
1080 | :-( | maps:put(Key, [Value\|Values], Map);
1081 | | error ->
1082 | 1 | maps:put(Key, [Value], Map)
1083 | | end.
1084 | |
@rickard-green ok, I'll see what |
@Maria-12648430 |
@rickard-green @Maria-12648430 I added some tests that cover the shutdown process, except for those cases in the |
Thanks 🤗 I'll take a closer look later. From what I see, you are using a child that takes some time to shut down to delay the supervisor and let other children exit by themselves? |
That's the idea, yes :) |
That doesn't work for |
I've tested this PR on SSH's tests after a tip by @IngelaAndin. Usually there are about 30 SUPERVISOR REPORTs due to a harmless hazard in the SSH supervisor tree. The sequence closing down the test cases triggers that hazard sometimes. With this PR all those reports vanish - I've tested three times! What about rebasing this PR on top of maint? It might also fix a couple of hard, non-reproducible issue reports in SSH too. Comments from the assignee @rickard-green (or anyone else of course)? |
@Maria-12648430 @juhlig The test coverage looks much better now! @HansN wrote:
Rebasing on top of the |
Great 😄
Ok. Everybody (involved) agrees? |
From a patch perspective it is always better to base the fix on an as early normal version (green versions in the version tree) as possible, since it then can be merged forward to all later versions. If based on top of Note that you cannot base it on the top of It is however possible to get into some trouble if you base a fix on a version that is too early, with merge conflicts due to rewrites in the same area and functionality not introduced yet that you need, so you don't want to base it on a too early version. You won't get into any such trouble using the top of |
It should easily rebase on maint as @HansN already tried that. I guess normally we expect our Open Source user to base things on maint or master. But as you say Rickard it can be interesting to actually patch this earlier than OTP-24.3 and so maint-24 is then a good idea. I guess we will have to take care of the merges locally, not using the merge button here though! |
@IngelaAndin @rickard-green Okaaay... so what shall I do, and how? 😅 TBH, I only tried that once before and probably did it wrong since I had to sort out conflicts afterwards, so... is changing the base branch for this PR here (ie via edit -> "base: master" -> "base: maint-24") enough? |
I would stand on your branch and do git rebase ---onto maint-24 HEAD~N where N are the number of commits that you have. Then it will take maint-24 and try to apply your commits with maint-24 as the base. The github thing will only work under some circumstances when moving something forward. You can always save a copy of your branch somewhere as backup if you feel nervous, although git has ways to restore mistakes even if you do not. |
Children were monitored and unlinked before sending the shutdown exit signal. If the supervisor dies between the unlinking and sending the exit signal, the child becomes an orphan. For similar reasons, erratic error messages would occur in logs when a child exited by itself while the supervisor tries to shut it down. The changes in this commit keep children linked until they have exited, and treat exit reasons as normal which may occur when a child exits by itself while being shut down by the supervisor. Co-authored-by: Jan Uhlig <juhlig@hnc-agency.org>
c85b25e
to
ffd032a
Compare
Ok, thanks for the pointers @IngelaAndin, worked like a charm 😄. (Btw, this PR still says "Maria-12648430 wants to merge 1 commit into erlang:master from Maria-12648430:supervisor_unlink_child_late" up there ☝️ Don't know if it matters, just pointing it out) |
@Maria-12648430 I fetched from GitHub, and your branch is now based on maint-24 (for the moment = OTP-24.1). |
Let's leave that to @rickard-green then 😉 Or do the merge locally (vs the merge button), as @IngelaAndin suggested. |
@Maria-12648430 @HansN @rickard-green I think we actually now should be able to change branch to maint with the edit as it is now based on maint-24 and hence can cleanly be merged forward. I just realized that should enable us to use the merge button for the PR too and then reuse the branch in our patch script. (That is we want it to say maint up there and not maint-24 but the branch can be based on maint-24). |
Clarification: By messy merges I mean lots of merge conflicts when merging forward to |
So, how are things here? Any news from testing? @HansN @rickard-green |
I haven't looked closely at testing of this (been fully occupied with other bugfixes), but I haven't seen any obvious issues. @HansN has maybe paid closer attention to this? Unless I find something worrying I will include these changes in the next patch which probably will be released later this week. One thing left to do is to write a release note, so if you have a suggestion for that it would be nice :-) |
Ummm... not sure 😅 FWIW, this is what I put as commit message, maybe you can distill something from that? I can't think of anything better that describes both the problem and the solution...
|
I checked the results of the testing, and there are no problems with the new test cases, neither in maint, master or maint-24 (the coming patch). |
What about something like:
|
@HansN hm, I don't think that explains it very well... IMO it's misleading even.
There wasn't a race here, in the sense that I would use the word. What could happen was that the supervisor unlinked the child and then died before sending the shutdown signal, which in turn would result in an orphaned child. But if anything, the supervisor was only racing its own death there, and it wasn't causing that race.
Now those were caused by a race between a supervisor shutting down a child and that child exiting by itself at the same time. I wouldn't call it a risk though. I mean, it wasn't anything dangerous 😉 Just puzzling and needlessly worrying maybe. |
... but I really have no idea what a good synonym for "bad side-effect of sudden death at a bad time" is 😬 |
"leaking children"? 😅😅😅 |
Yeah, children do that sometimes ^^;;; Fix: Diapers |
You're not exactly helpful 😝 |
I wasn't trying to be 😛 What about this?
And the other, how does this sound?
|
Sounds good to me, what do the others think? |
Sounds good! I guess that the part about the erratic supervisor reports covers the loss of the real exit reason resulting in a |
Yes 😄 |
@Maria-12648430 & @juhlig Nice work on this PR, thanks! Everything is now prepared for patch of this branch, including forward merge to |
Great 🤩 Always nice to work with you all 😃 |
I stumbled over a bug report from 2013 that is very similar to this, but not exactly. The bug report said that the exit reason of a process may be lost during a shutdown race if:
From what I can tell this PR fixes that problem as well. Possibly this race also got a lot less likely when we redid how process interactions work in erts to use signals instead of taking locks of the target process. Just thought I'd add this comment here so that anyone who goes looking for a fix to that issue will know where it was fixed. |
@garazdawi the That said, a peculiarity of |
Any reason why you didn't include We are still seeing the |
TBH, I don't remember 🤷 But I guess it was just an oversight, at least I don't see a reason right now. |
Hm, it is I'm pretty curious what feature you can (un-)break by (re-)naming a variable =^^= If you want to tell me about it, just PM me on Erlang Forums. But however that may be, I don't carry any special weight with OTP whatsoever, I was just co-author to this PR, nothing more. So I'd have to make another PR and explain the reasons, just like everyone else, ie like yourself 😉 |
This PR was prompted by this thread on the Erlang Questions ML, concerning unexpected
noproc
exit reasons due to a race condition when a supervisor child self-terminates and is at the same time terminated via supervisor:terminate_child/2
. In the course of the discussion, it was discovered that there exists a condition where a child could become orphaned when its supervisor dies at a critical moment. In a nutshell, in the current implementation, the procedure of shutting down a child is as follows:
'EXIT'
message in case the child just died by itself'DOWN'
message'DOWN'
message. However, if the supervisor dies between (2) and (4), the child will be left running, unaware even that it has become an orphan.
This PR changes the procedure as follows:
'DOWN'
message, then unlink it and flush out the possible'EXIT'
message'DOWN'
message, then unlink it and flush out the possible'EXIT'
messageThis way, the child remains linked to the supervisor until it is definitely dead.
Flushing out the
'EXIT'
messages implicitly serves as a way to retrieve the child's exit reason (the'DOWN'
message might reach the child only after it has died earlier, i.e. too late, resulting in the noproc
that prompted the initial post on the aforementioned thread). This is a best-effort approach (which comes for free); if the child unlinked itself from the supervisor ("naughty child"), there will be no 'EXIT'
message.Additionally, this PR handles some exit reasons as normal (ie, not logging them as errors) which would be reported by the current implementation simply because the child exited at the wrong moment.