Permalink
Browse files

Be more resilient when reading pidfiles

Unfortunately, unicorn doesn't write its pidfile atomically, which means there
is a small but finite chance that during hot-reload, Unicorn Herder will read
the pidfile after the new master has opened it, but before it has
written/closed it. The result of this is that Unicorn Herder becomes detached
from the new master.

This commit changes that, by making Unicorn Herder attempt to read a pidfile
multiple times before giving up, provided it is not expecting the unicorn
process to exit.
  • Loading branch information...
1 parent d22c72e commit a89dc342a58e0b76ce6bac0654063a632deddbe0 @nickstenning nickstenning committed Aug 24, 2012
Showing with 69 additions and 49 deletions.
  1. +23 −13 test/test_herder.py
  2. +46 −36 unicornherder/herder.py
View
@@ -56,36 +56,46 @@ def test_spawn_unicorn_timeout(self, timeout_mock, popen_mock):
popen_mock.return_value.pid = -1
timeout_mock.side_effect = fake_timeout_fail
h = Herder()
- h._boot_loop = lambda: True
popen_mock.return_value.poll.return_value = None
ret = h.spawn()
assert_false(ret)
popen_mock.return_value.terminate.assert_called_once_with()
+ @patch('unicornherder.herder.time.sleep')
@patch('unicornherder.herder.psutil.Process')
@patch('%s.open' % builtin_mod)
- def test_loop_valid_pid(self, open_mock, process_mock):
+ def test_loop_valid_pid(self, open_mock, process_mock, sleep_mock):
open_mock.return_value.read.return_value = '123\n'
h = Herder()
ret = h._loop_inner()
- assert_equal(ret, 0)
+ assert_equal(ret, True)
process_mock.assert_called_once_with(123)
+ @patch('unicornherder.herder.time.sleep')
@patch('%s.open' % builtin_mod)
- def test_loop_invalid_pid(self, open_mock):
+ def test_loop_invalid_pid(self, open_mock, sleep_mock):
open_mock.return_value.read.return_value = 'foobar'
h = Herder()
- ret = h._loop_inner()
- assert_equal(ret, 1)
+ assert_raises(HerderError, h._loop_inner)
+ @patch('unicornherder.herder.time.sleep')
@patch('%s.open' % builtin_mod)
- def test_loop_nonexistent_pidfile(self, open_mock):
+ def test_loop_nonexistent_pidfile(self, open_mock, sleep_mock):
def _fail():
raise IOError()
open_mock.return_value.read.side_effect = _fail
h = Herder()
- ret = h._loop_inner()
- assert_equal(ret, 1)
+ assert_raises(HerderError, h._loop_inner)
+
+ @patch('unicornherder.herder.time.sleep')
+ @patch('%s.open' % builtin_mod)
+ def test_loop_nonexistent_pidfile_terminating(self, open_mock, sleep_mock):
+ def _fail():
+ raise IOError()
+ open_mock.return_value.read.side_effect = _fail
+ h = Herder()
+ h.terminating = True
+ assert_equal(h._loop_inner(), False)
@patch('unicornherder.herder.time.sleep')
@patch('unicornherder.herder.psutil.Process')
@@ -101,12 +111,12 @@ def test_loop_detects_pidchange(self, open_mock, process_mock, sleep_mock):
open_mock.return_value.read.return_value = '123\n'
process_mock.return_value = proc1
ret = h._loop_inner()
- assert_equal(ret, 0)
+ assert_equal(ret, True)
open_mock.return_value.read.return_value = '456\n'
process_mock.return_value = proc2
ret = h._loop_inner()
- assert_equal(ret, 0)
+ assert_equal(ret, True)
expected_calls = []
assert_equal(proc1.mock_calls, expected_calls)
@@ -126,15 +136,15 @@ def test_loop_reload_pidchange_signals(self, open_mock, process_mock,
open_mock.return_value.read.return_value = '123\n'
process_mock.return_value = proc1
ret = h._loop_inner()
- assert_equal(ret, 0)
+ assert_equal(ret, True)
# Simulate SIGHUP
h._handle_HUP(signal.SIGHUP, None)
open_mock.return_value.read.return_value = '456\n'
process_mock.return_value = proc2
ret = h._loop_inner()
- assert_equal(ret, 0)
+ assert_equal(ret, True)
expected_calls = [call.send_signal(signal.SIGUSR2),
call.send_signal(signal.SIGWINCH),
@@ -17,6 +17,8 @@
MANAGED_PIDS = set([])
+WORKER_WAIT = 120
+
class HerderError(Exception):
pass
@@ -53,6 +55,7 @@ def __init__(self, unicorn='gunicorn', pidfile=None, args=''):
self.master = None
self.reloading = False
+ self.terminating = False
def spawn(self):
"""
@@ -81,16 +84,6 @@ def spawn(self):
process.terminate()
return False
- try:
- with timeout(5):
- self._boot_loop()
- except TimeoutError:
- log.error('%s failed to write a pidfile within 5 seconds. Sending TERM '
- 'and exiting.', self.unicorn)
- if process.poll() is None:
- process.terminate()
- return False
-
# If we got this far, unicorn has daemonized, and we no longer need to
# worry about the original process.
MANAGED_PIDS.remove(process.pid)
@@ -113,25 +106,26 @@ def spawn(self):
def loop(self):
"""Enter the monitoring loop"""
while True:
- ret = self._loop_inner()
- if ret != 0:
+ if not self._loop_inner():
# The unicorn has died. So should we.
log.error('Unicorn died. Exiting.')
- return ret
+ return 1
time.sleep(2)
def _loop_inner(self):
old_master = self.master
pid = self._read_pidfile()
+ if pid is None:
+ return False
+
try:
self.master = psutil.Process(pid)
- except (psutil.NoSuchProcess, TypeError):
- return 1
+ except psutil.NoSuchProcess:
+ return False
if old_master is None:
- log.info('Unicorn booted (PID %s)',
- self.master.pid)
+ log.info('Unicorn booted (PID %s)', self.master.pid)
MANAGED_PIDS.add(self.master.pid)
@@ -150,33 +144,48 @@ def _loop_inner(self):
MANAGED_PIDS.remove(old_master.pid)
- return 0
-
- def _boot_loop(self):
- while True:
- if self._loop_inner() == 0:
- break
- time.sleep(1)
+ return True
def _read_pidfile(self):
- try:
- content = open(self.pidfile).read()
- except IOError:
- return None
-
- try:
- pid = int(content)
- except ValueError:
- return None
-
- return pid
+ for _ in range(5):
+ try:
+ content = open(self.pidfile).read()
+ except IOError as e:
+ # If we are expecting unicorn to die, then this is normal, and
+ # we can just return None, thus triggering a clean exit of the
+ # Herder.
+ if self.terminating:
+ return None
+ else:
+ log.debug('Got IOError while attempting to read pidfile: %s', e)
+ log.debug('This is usually not fatal. Retrying in a moment...')
+ time.sleep(1)
+ continue
+
+ try:
+ pid = int(content)
+ except ValueError as e:
+ log.debug('Got ValueError while reading pidfile. Is "%s" an integer? %s',
+ content, e)
+ log.debug('This is usually not fatal. Retrying in a moment...')
+ time.sleep(1)
+ continue
+
+ return pid
+
+ raise HerderError('Failed to read pidfile %s after 5 attempts, aborting!' %
+ self.pidfile)
def _handle_signal(self, name):
def _handler(signum, frame):
if self.master is None:
log.warn("Caught %s but have no tracked process.", name)
return
+ if signum in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
+ log.debug("Caught %s: expecting termination.", name)
+ self.terminating = True
+
log.debug("Forwarding %s to PID %s", name, self.master.pid)
self.master.send_signal(signum)
@@ -206,9 +215,10 @@ def _emergency_slaughter():
except:
pass
+
def _wait_for_workers(process):
# TODO: do something smarter here
- time.sleep(120)
+ time.sleep(WORKER_WAIT)
def _kill_old_master(process):

0 comments on commit a89dc34

Please sign in to comment.