Skip to content
Browse files

Allow processes to idle before exiting

When system load exceeds the ability of os_process_soft_limit to keep
up with demand we enter a fork-use-kill (FUK) cycle. The constant
spawning and destruction os these processes thrashes system resources
and causes general instability.

This patch changes the behavior from killing each process as its
returned to letting it idle for a configurable amount of time (default
five minutes) which allows it to be reused by other clients. This way we
can avoid adding unnecessary load when demand for couchjs processes
exceeds os_process_soft_limit.

As a happy benefit this should also allow os_process_soft_limit to be
set much lower since the number of processes will now more closely
follow actual demand (instead of provisioning for the worst case
scenario).

Conflicts:

	apps/couch/src/couch_os_process.erl
	apps/couch/src/couch_proc_manager.erl

Conflicts:
	apps/couch/src/couch_os_process.erl
  • Loading branch information...
1 parent 90c656f commit b9f0271cf63efed0af5c3c4aec5a31c928da1079 @davisp davisp committed with Robert Newson Dec 3, 2011
Showing with 42 additions and 26 deletions.
  1. +24 −13 apps/couch/src/couch_os_process.erl
  2. +18 −13 apps/couch/src/couch_proc_manager.erl
View
37 apps/couch/src/couch_os_process.erl
@@ -27,7 +27,8 @@
port,
writer,
reader,
- timeout=5000
+ timeout=5000,
+ idle
}).
start_link(Command) ->
@@ -104,12 +105,15 @@ readjson(#os_proc{} = OsProc) ->
% gen_server API
init([Command, Options, PortOptions]) ->
+ V = couch_config:get("query_server_config", "os_process_idle_limit", "300"),
+ IdleLimit = list_to_integer(V) * 1000,
Spawnkiller = filename:join([code:priv_dir(couch), "couchspawnkillable.sh"]),
BaseProc = #os_proc{
command=Command,
port=open_port({spawn, Spawnkiller ++ " " ++ Command}, PortOptions),
writer=fun writejson/2,
- reader=fun readjson/1
+ reader=fun readjson/1,
+ idle=IdleLimit
},
KillCmd = readline(BaseProc),
Pid = self(),
@@ -131,49 +135,56 @@ init([Command, Options, PortOptions]) ->
Proc#os_proc{timeout=TimeOut}
end
end, BaseProc, Options),
- {ok, OsProc}.
+ {ok, OsProc, IdleLimit}.
terminate(_Reason, #os_proc{port=Port}) ->
catch port_close(Port),
ok.
-handle_call({set_timeout, TimeOut}, _From, OsProc) ->
- {reply, ok, OsProc#os_proc{timeout=TimeOut}};
-handle_call({prompt, Data}, _From, OsProc) ->
+handle_call({set_timeout, TimeOut}, _From, #os_proc{idle=Idle}=OsProc) ->
+ {reply, ok, OsProc#os_proc{timeout=TimeOut}, Idle};
+handle_call({prompt, Data}, _From, #os_proc{idle=Idle}=OsProc) ->
#os_proc{writer=Writer, reader=Reader} = OsProc,
try
Writer(OsProc, Data),
- {reply, {ok, Reader(OsProc)}, OsProc}
+ {reply, {ok, Reader(OsProc)}, OsProc, Idle}
catch
throw:{error, OsError} ->
- {reply, OsError, OsProc};
+ {reply, OsError, OsProc, Idle};
throw:{fatal, OsError} ->
{stop, normal, OsError, OsProc};
throw:OtherError ->
{stop, normal, OtherError, OsProc}
end.
-handle_cast({send, Data}, #os_proc{writer=Writer}=OsProc) ->
+handle_cast({send, Data}, #os_proc{writer=Writer, idle=Idle}=OsProc) ->
try
Writer(OsProc, Data),
- {noreply, OsProc}
+ {noreply, OsProc, Idle}
catch
throw:OsError ->
?LOG_ERROR("Failed sending data: ~p -> ~p", [Data, OsError]),
{stop, normal, OsProc}
end;
handle_cast(stop, OsProc) ->
{stop, normal, OsProc};
-handle_cast(Msg, OsProc) ->
+handle_cast(Msg, #os_proc{idle=Idle}=OsProc) ->
?LOG_DEBUG("OS Proc: Unknown cast: ~p", [Msg]),
- {noreply, OsProc}.
+ {noreply, OsProc, Idle}.
+handle_info(timeout, #os_proc{idle=Idle}=OsProc) ->
+ gen_server:cast(couch_proc_manager, {os_proc_idle, self()}),
+ erlang:garbage_collect(),
+ {noreply, OsProc, Idle};
handle_info({Port, {exit_status, 0}}, #os_proc{port=Port}=OsProc) ->
?LOG_INFO("OS Process terminated normally", []),
{stop, normal, OsProc};
handle_info({Port, {exit_status, Status}}, #os_proc{port=Port}=OsProc) ->
?LOG_ERROR("OS Process died with status: ~p", [Status]),
- {stop, {exit_status, Status}, OsProc}.
+ {stop, {exit_status, Status}, OsProc};
+handle_info(Msg, #os_proc{idle=Idle}=OsProc) ->
+ ?LOG_DEBUG("OS Proc: Unknown info: ~p", [Msg]),
+ {noreply, OsProc, Idle}.
code_change(_OldVsn, State, _Extra) ->
{ok, State}.
View
31 apps/couch/src/couch_proc_manager.erl
@@ -67,13 +67,29 @@ handle_call({ret_proc, #proc{client=Ref, pid=Pid} = Proc}, _From, State) ->
% #proc{} from our own table, so the alternative is to do a lookup in the
% table before the insert. Don't know which approach is cheaper.
case is_process_alive(Pid) of true ->
- maybe_reuse_proc(State#state.tab, Proc);
+ ets:insert(State#state.tab, Proc);
false -> ok end,
{reply, true, State};
handle_call(_Call, _From, State) ->
{reply, ignored, State}.
+handle_cast({os_proc_idle, Pid}, #state{tab=Tab}=State) ->
+ Limit = couch_config:get("query_server_config", "os_process_soft_limit", "100"),
+ case ets:info(Tab, size) > list_to_integer(Limit) of
+ true ->
+ ets:delete(Tab, Pid),
+ case is_process_alive(Pid) of
+ true ->
+ unlink(Pid),
+ gen_server:cast(Pid, stop);
+ _ ->
+ ok
+ end;
+ _ ->
+ ok
+ end,
+ {noreply, State};
handle_cast(_Msg, State) ->
{noreply, State}.
@@ -95,7 +111,7 @@ handle_info({'DOWN', Ref, _, _, _Reason}, State) ->
ok;
[#proc{pid = Pid} = Proc] ->
case is_process_alive(Pid) of true ->
- maybe_reuse_proc(State#state.tab, Proc);
+ ets:insert(State#state.tab, Proc);
false -> ok end
end,
{noreply, State};
@@ -110,17 +126,6 @@ terminate(_Reason, #state{tab=Tab}) ->
code_change(_OldVsn, State, _Extra) ->
{ok, State}.
-maybe_reuse_proc(Tab, #proc{pid = Pid} = Proc) ->
- Limit = couch_config:get("query_server_config", "os_process_soft_limit", "100"),
- case ets:info(Tab, size) > list_to_integer(Limit) of
- true ->
- ets:delete(Tab, Pid),
- unlink(Pid),
- exit(Pid, kill);
- false ->
- garbage_collect(Pid),
- ets:insert(Tab, Proc#proc{client=nil})
- end.
get_procs(Tab, Lang) when is_binary(Lang) ->
get_procs(Tab, binary_to_list(Lang));

0 comments on commit b9f0271

Please sign in to comment.
Something went wrong with that request. Please try again.