This repository has been archived by the owner on May 6, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
agent: Make the agent the subreaper of all processes
Our agent had a flaw, it potentially lives with some zombie processes according to what has been done by the container user. The explanation is that we only wait for container and exec processes but we don't handle the children and grandchildren processes coming from those initial container and exec processes. The example of such a case can be shown demonstrated in case the container process spawn a child process, and this one will also spawn another child process. In case the child terminates before its own children, those ones expect to be reaped by the init process of the PID namespace, i.e the container process. But this container process is not written to reap descendant processes, which will lead to a zombie process left behind. This commit solves those cases by setting the agent process, who will be the father of all container processes, as a subreaper. This way, it will receive all the SIGCHLD signals from any child left behind, and will reap them. Notice this patch makes sure we don't check the error from the libcontainer call to process.Wait() since it won't be able to reap the container or exec processes as expected. This libcontainer Wait() function is only used here to properly cleanup the pipes and internal structures created by libcontainer. Fixes #177 Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
- Loading branch information
Sebastien Boeuf
committed
Dec 18, 2017
1 parent
e8d8a5f
commit c36a44a
Showing
2 changed files
with
262 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
// | ||
// Copyright (c) 2017 Intel Corporation | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
// | ||
|
||
package main | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
"os/exec" | ||
"sync" | ||
|
||
"github.com/opencontainers/runc/libcontainer" | ||
"golang.org/x/sys/unix" | ||
) | ||
|
||
type reaper struct { | ||
sync.RWMutex | ||
|
||
chansLock sync.RWMutex | ||
exitCodeChans map[int]chan int | ||
} | ||
|
||
func exitStatus(status unix.WaitStatus) int { | ||
if status.Signaled() { | ||
return exitSignalOffset + int(status.Signal()) | ||
} | ||
|
||
return status.ExitStatus() | ||
} | ||
|
||
func (r *reaper) getExitCodeCh(pid int) (chan int, error) { | ||
r.chansLock.RLock() | ||
defer r.chansLock.RUnlock() | ||
|
||
exitCodeCh, exist := r.exitCodeChans[pid] | ||
if !exist { | ||
return nil, fmt.Errorf("Process %d not found", pid) | ||
} | ||
|
||
return exitCodeCh, nil | ||
} | ||
|
||
func (r *reaper) setExitCodeCh(pid int, exitCodeCh chan int) { | ||
r.chansLock.Lock() | ||
defer r.chansLock.Unlock() | ||
|
||
r.exitCodeChans[pid] = exitCodeCh | ||
} | ||
|
||
func (r *reaper) deleteExitCodeCh(pid int) { | ||
r.chansLock.Lock() | ||
defer r.chansLock.Unlock() | ||
|
||
exitCodeCh, exist := r.exitCodeChans[pid] | ||
if !exist { | ||
return | ||
} | ||
|
||
close(exitCodeCh) | ||
|
||
delete(r.exitCodeChans, pid) | ||
} | ||
|
||
func (r *reaper) reap() error { | ||
var ( | ||
ws unix.WaitStatus | ||
rus unix.Rusage | ||
) | ||
|
||
// When running new processes, libcontainer expects to wait | ||
// for the first process actually spawning the container. | ||
// This lock allows any code starting a new process to take | ||
// the lock prior to the start of this new process, preventing | ||
// the subreaper from reaping unwanted processes. | ||
r.Lock() | ||
defer r.Unlock() | ||
|
||
for { | ||
pid, err := unix.Wait4(-1, &ws, unix.WNOHANG, &rus) | ||
if err != nil { | ||
if err == unix.ECHILD { | ||
return nil | ||
} | ||
|
||
return err | ||
} | ||
if pid < 1 { | ||
return nil | ||
} | ||
|
||
status := exitStatus(ws) | ||
|
||
agentLog.Infof("SIGCHLD pid %d, status %d", pid, status) | ||
|
||
exitCodeCh, err := r.getExitCodeCh(pid) | ||
if err != nil { | ||
// No need to signal a process with no channel | ||
// associated. When a process has not been registered, | ||
// this means the spawner does not expect to get the | ||
// exit code from this process. | ||
continue | ||
} | ||
|
||
// Here, we have to signal the routine listening on | ||
// this channel so that it can complete the cleanup | ||
// of the process and return the exit code to the | ||
// caller of WaitProcess(). | ||
exitCodeCh <- status | ||
} | ||
} | ||
|
||
// start starts the exec command and registers the process to the reaper. | ||
// This function is a helper for exec.Cmd.Start() since this needs to be | ||
// in sync with exec.Cmd.Wait(). | ||
func (r *reaper) start(c *exec.Cmd) error { | ||
// This lock is very important to avoid any race with reaper.reap(). | ||
// We don't want the reaper to reap a process before we have added | ||
// it to the exit code channel list. | ||
r.RLock() | ||
defer r.RUnlock() | ||
|
||
if err := c.Start(); err != nil { | ||
return err | ||
} | ||
|
||
// This channel is buffered so that reaper.reap() will not | ||
// block until reaper.wait() listen onto this channel. | ||
r.setExitCodeCh(c.Process.Pid, make(chan int, 1)) | ||
|
||
return nil | ||
} | ||
|
||
// wait blocks until the expected process has been reaped. After the reaping | ||
// from the subreaper, the exit code is sent through the provided channel. | ||
// This function is a helper for exec.Cmd.Wait() and os.Process.Wait() since | ||
// both cannot be used directly, because of the subreaper. | ||
func (r *reaper) wait(pid int, proc waitProcess) (int, error) { | ||
exitCodeCh, err := r.getExitCodeCh(pid) | ||
if err != nil { | ||
return -1, err | ||
} | ||
|
||
// Wait for the subreaper to receive the SIGCHLD signal. Once it gets | ||
// it, this channel will be notified by receiving the exit code of the | ||
// corresponding process. | ||
exitCode := <-exitCodeCh | ||
|
||
// Ignore errors since the process has already been reaped by the | ||
// subreaping loop. This call is only used to make sure libcontainer | ||
// properly cleans up its internal structures and pipes. | ||
proc.wait() | ||
|
||
r.deleteExitCodeCh(pid) | ||
|
||
return exitCode, nil | ||
} | ||
|
||
type waitProcess interface { | ||
wait() | ||
} | ||
|
||
type reaperOSProcess os.Process | ||
|
||
func (p *reaperOSProcess) wait() { | ||
(*os.Process)(p).Wait() | ||
} | ||
|
||
type reaperExecCmd exec.Cmd | ||
|
||
func (c *reaperExecCmd) wait() { | ||
(*exec.Cmd)(c).Wait() | ||
} | ||
|
||
type reaperLibcontainerProcess libcontainer.Process | ||
|
||
func (p *reaperLibcontainerProcess) wait() { | ||
(*libcontainer.Process)(p).Wait() | ||
} |