Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add health checks for server processes #227

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ RUN apt-get -y update && apt-get -y upgrade && \
apt-get -y update --fix-missing && \
apt-get -y install && apt-get -y upgrade && \
apt-get -y install software-properties-common && \
apt-get -y install curl && \
racheldaniel marked this conversation as resolved.
Show resolved Hide resolved
apt-get -y install iputils-ping &&\
apt-get -y install git libpq-dev openssh-client openssl && \
apt-get -y autoremove && \
rm -rf /var/lib/apt/lists/*
Expand Down
23 changes: 23 additions & 0 deletions bash/liveliness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash

# First, make a call to the ready server endpoint. It should return 200
response=$(curl -s -o /dev/null -w "%{http_code}" localhost:8585/ready)

# Check if the response status code is 200
if [ "$response" -eq 200 ]; then
# Set default value for CELERY_BROKER_URL if not already set
CELERY_BROKER_URL=${CELERY_BROKER_URL:-redis://localhost:6379/0}

# Check if the broker is reachable
if redis-cli -u "$CELERY_BROKER_URL" PING > /dev/null 2>&1; then
# Check the time difference between the current timestamp and the last modification timestamp
# of the worker_heartbeat file. This will also eval to False if the file doesn't exist yet.
if test $(($(date +%s) - $(stat -c %Y /tmp/worker_heartbeat))) -lt 10; then
# Both checks passed, return 1
exit 1
fi
fi
fi

# Either of the checks failed, return 0
exit 0
41 changes: 40 additions & 1 deletion dbt_worker/app.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from celery import Celery
from pathlib import Path
from celery import Celery, bootsteps
from celery.signals import worker_ready, worker_shutdown
from dbt_server.flags import CELERY_BACKEND_URL
from dbt_server.flags import CELERY_BROKER_URL
from dbt_worker import celeryconfig
Expand All @@ -16,5 +18,42 @@

app.config_from_object(celeryconfig)

LIVENESS_FILE = Path("/tmp/worker_heartbeat")


class LivenessProbe(bootsteps.StartStopStep):
requires = {"celery.worker.components:Timer"}

def __init__(self, _, **kwargs):
self.requests = []
self.tref = None

def start(self, worker):
self.tref = worker.timer.call_repeatedly(
1.0,
self.update_heartbeat_file,
(worker,),
priority=10,
)

def stop(self, _):
LIVENESS_FILE.unlink(missing_ok=True)

def update_heartbeat_file(self, _):
LIVENESS_FILE.touch()


@worker_ready.connect
def worker_ready(**_):
LIVENESS_FILE.touch()

racheldaniel marked this conversation as resolved.
Show resolved Hide resolved

@worker_shutdown.connect
def worker_shutdown(**_):
LIVENESS_FILE.unlink(missing_ok=True)


app.steps["worker"].add(LivenessProbe)

if __name__ == "__main__":
app.start()
26 changes: 18 additions & 8 deletions dbt_worker/tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from time import sleep
from time import sleep, time
from billiard.context import Process

import os
Expand Down Expand Up @@ -265,14 +265,24 @@ def _handle_abort(task_id, p, callback_url):
# Process is not alive, simply return.
if not p.is_alive():
return
try:
# Try to kill process using SIGINT.
# TODO: Send SIGTERM after a timeout-- there is
# a bug in core that sometimes makes SIGINT ineffective
os.kill(p.pid, signal.SIGINT)
except Exception as e:
logger.info(str(e))

# As of 05/2023 there is a bug in dbt-core that will cause occasional
# timeouts for KeyboardInterrupts.
# To mediate, wait for process to exit or retry killing it with SIGINT
retry_timeout = 5 # seconds
start_time = time()
while p.is_alive():
if time() - start_time >= retry_timeout:
break
try:
os.kill(p.pid, signal.SIGINT)
except Exception as e:
logger.info(str(e))
sleep(1)

# If the process is still alive, send a SIGKILL signal to force it to exit.
if p.is_alive():
os.kill(p.pid, signal.SIGKILL)
try:
if callback_url:
_send_state_callback(callback_url, task_id, ABORTED)
Expand Down
Loading