Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,87 @@ src/firetower/ # Django backend
frontend/ # React frontend
sdk/ # Python SDK
```

## Scheduled Tasks

Scheduled tasks are stored as database objects and are managed via migrations.

### Adding a Task

First, define you task in the `SCHEDULES` map in `src/firetower/incidents/tasks.py`.

Then, create a new migration referencing it:

```python
from django.db import migrations

from firetower.incidents.tasks import SCHEDULES


def create_schedule(apps, schema_editor):
Schedule = apps.get_model("django_q", "Schedule")
schedule_name = "[schedule name goes here]"
Schedule.objects.get_or_create(
name=schedule_name, defaults=SCHEDULES[schedule_name]
)


def delete_schedule(apps, schema_editor):
Schedule = apps.get_model("django_q", "Schedule")
schedule_name = "schedule_demo"
Schedule.objects.filter(name=schedule_name).delete()


class Migration(migrations.Migration):
dependencies = [
("incidents", "[previous migration goes here]"),
]

operations = [
migrations.RunPython(create_schedule, delete_schedule),
]
```

### Removing a task

First, generate the opposite of the migration used to add the schedule:

```python
from django.db import migrations

from firetower.incidents.tasks import SCHEDULES


def create_schedule(apps, schema_editor):
Schedule = apps.get_model("django_q", "Schedule")
schedule_name = "[schedule name goes here]"
Schedule.objects.get_or_create(
name=schedule_name, defaults=SCHEDULES[schedule_name]
)


def delete_schedule(apps, schema_editor):
Schedule = apps.get_model("django_q", "Schedule")
schedule_name = "schedule_demo"
Schedule.objects.filter(name=schedule_name).delete()


class Migration(migrations.Migration):
dependencies = [
("incidents", "[previous migration goes here]"),
]

operations = [
migrations.RunPython(delete_schedule, create_schedule),
]
```

Once this migration has run everywhere, you can then remove the body from the `SCHEDULES` array, but keep the key since these are still referenced in legacy migrations!

```python
SCHEDULES = {
"schedule_demo": {
# Removed in <migration name>
},
}
```
16 changes: 9 additions & 7 deletions docker/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@ set -e
set -u
set -x

export PYTHONPATH="/app:${PYTHONPATH-}"

if [ z"$1" = "zmigrate" ]; then
COMMAND="/app/.venv/bin/django-admin migrate --settings firetower.settings"
/app/.venv/bin/ddtrace-run /app/.venv/bin/django-admin createcachetable --settings firetower.settings
exec /app/.venv/bin/ddtrace-run /app/.venv/bin/django-admin migrate --settings firetower.settings
Comment thread
taylor-osler-sentry marked this conversation as resolved.
elif [ z"$1" = "zserver" ]; then
COMMAND="/app/.venv/bin/granian --interface wsgi --host 0.0.0.0 --port $PORT firetower.wsgi:application"
exec /app/.venv/bin/ddtrace-run /app/.venv/bin/granian --interface wsgi --host 0.0.0.0 --port "${PORT}" firetower.wsgi:application
elif [ z"$1" = "zslack-bot" ]; then
COMMAND="/app/.venv/bin/django-admin run_slack_bot --settings firetower.settings"
exec /app/.venv/bin/ddtrace-run /app/.venv/bin/django-admin run_slack_bot --settings firetower.settings
elif [ z"$1" = "zworker" ]; then
exec /app/.venv/bin/ddtrace-run /app/.venv/bin/django-admin wrapped_worker --settings firetower.settings
else
echo "Usage: $0 (migrate|server|slack-bot)"
echo "Usage: $0 (migrate|server|slack-bot|worker)"
exit 1
fi

export PYTHONPATH=/app:\$PYTHONPATH
/app/.venv/bin/ddtrace-run $COMMAND
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ dependencies = [
"pyserde[toml]>=0.28.0",
"notion-client>=3.0.0,<4.0.0",
"requests>=2.32.0",
"django-q2>=1.7.4",
"slack-bolt>=1.27.0",
"slack-sdk>=3.31.0",
"sentry-sdk>=2.47.0",
Expand All @@ -41,6 +42,7 @@ dev = [
"ty>=0.0.1a19",
"django-stubs>=5.2.7",
"django-stubs-ext>=5.2.7",
"blessed>=1.39.0",
]
prod = [
"granian>=2.6.0",
Expand Down
2 changes: 1 addition & 1 deletion src/firetower/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def __init__(self) -> None:
self.statuspage = None
self.project_key = ""
self.firetower_base_url = ""
self.django_secret_key = ""
self.django_secret_key = "dummy_value_DO_NOT_USE"
self.salt_key = ""
self.sentry_dsn = ""
self.region_grouping: list[list[str]] = []
Expand Down
Empty file.
Empty file.
133 changes: 133 additions & 0 deletions src/firetower/incidents/management/commands/wrapped_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import logging
Comment thread
taylor-osler-sentry marked this conversation as resolved.
import os
import re
import shutil
import signal
import subprocess
import sys
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer
from typing import Any

from django.core.management.base import BaseCommand
from django_q.conf import Conf
from django_q.humanhash import humanize
from django_q.status import Stat

logger = logging.getLogger(__name__)

_CLUSTER_NAME_RE = re.compile(r"Q Cluster (\S+) starting\.")

_state: dict[str, Any] = {}
_shutdown = threading.Event()


class _HealthHandler(BaseHTTPRequestHandler):
def do_GET(self) -> None:
cluster_name = _state.get("cluster_name")

if not cluster_name:
self._respond(503, "cluster not yet started", None)
return

# TODO: this is awkward. Because the output is "humanized" we can't do a simple query.
# TODO: is there maybe some way to un-humanize?
target = next(
(s for s in Stat.get_all() if humanize(s.cluster_id.hex) == cluster_name),
None,
)

if target is None:
self._respond(503, cluster_name, "not found or still starting")
elif target.status in (Conf.IDLE, Conf.WORKING):
self._respond(200, cluster_name, target.status)
else:
Comment thread
sentry-warden[bot] marked this conversation as resolved.
status = target.status
self._respond(500, cluster_name, status)

def _respond(self, code: int, cluster_name: str, status: Any) -> None:
self.send_response(code)
self.send_header("Content-type", "text/html")
self.end_headers()
self.wfile.write(
(
"<html><head><title>Django-Q Health Check</title></head>"
f"<body><p>Health check returned {code} response</p>"
f"<p>Cluster {cluster_name} status: {status}</p></body></html>"
).encode()
)
Comment thread
taylor-osler-sentry marked this conversation as resolved.

def log_message(self, format: str, *args: Any) -> None:
pass


def _start_health_server() -> HTTPServer:
port = int(os.environ.get("PORT", "8080"))
server = HTTPServer(("0.0.0.0", port), _HealthHandler)
thread = threading.Thread(target=server.serve_forever, daemon=True)
thread.start()
logger.info("Health check server listening on port %d", port)
Comment thread
sentry-warden[bot] marked this conversation as resolved.
return server


def _handle_shutdown(signum: int, frame: Any) -> None:
logger.info("Received signal %d, shutting down", signum)
_shutdown.set()


class Command(BaseCommand):
help = "Run a Q cluster subprocess wrapped with an HTTP health check server."

def handle(self, *args: Any, **options: Any) -> None:
_shutdown.clear()
_state.clear()
signal.signal(signal.SIGTERM, _handle_shutdown)
signal.signal(signal.SIGINT, _handle_shutdown)

server = _start_health_server()

django_admin = shutil.which("django-admin")
if django_admin is None or django_admin == "":
Comment thread
taylor-osler-sentry marked this conversation as resolved.
django_admin = "/app/.venv/bin/django-admin"

proc = None
try:
proc = subprocess.Popen(
[django_admin, "qcluster", "--settings", "firetower.settings"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
)

def _pump_output() -> None:
assert proc.stdout is not None
for line in proc.stdout:
sys.stdout.write(line)
sys.stdout.flush()
if "cluster_name" not in _state:
match = _CLUSTER_NAME_RE.search(line)
if match:
_state["cluster_name"] = match.group(1)
logger.info("Detected cluster name: %s", match.group(1))

pump_thread = threading.Thread(target=_pump_output, daemon=True)
pump_thread.start()

while not _shutdown.is_set():
if proc.poll() is not None:
logger.warning(
"qcluster subprocess exited with code %s", proc.returncode
)
break
_shutdown.wait(timeout=1)

Check warning on line 123 in src/firetower/incidents/management/commands/wrapped_worker.py

View workflow job for this annotation

GitHub Actions / warden: code-review

wrapped_worker exits 0 even when qcluster subprocess fails

When the qcluster subprocess exits with a non-zero return code, the command logs a warning and breaks out of the loop, but `handle` returns normally so the management command exits with status 0. Container orchestrators (Docker/K8s) will not detect the failure and will not restart the worker, leaving the health endpoint as the only signal—but the HTTP server is also shut down on exit, so external monitors may not catch the crash promptly.
finally:
Comment thread
sentry-warden[bot] marked this conversation as resolved.
server.shutdown()
server.server_close()

Check failure on line 126 in src/firetower/incidents/management/commands/wrapped_worker.py

View workflow job for this annotation

GitHub Actions / warden: find-bugs

wrapped_worker exits 0 when qcluster subprocess crashes

When the qcluster subprocess exits unexpectedly (non-zero return code), the command logs a warning and breaks out of the loop, but `handle` returns normally so the management command exits with status 0. Container orchestrators (Kubernetes, Docker) rely on a non-zero exit code to detect failures and restart the worker. As written, a crashed worker will appear healthy to the orchestrator and will not be restarted, silently halting background job processing.
if proc and proc.poll() is None:
proc.terminate()

Check warning on line 128 in src/firetower/incidents/management/commands/wrapped_worker.py

View check run for this annotation

@sentry/warden / warden: code-review

Process exits 0 when qcluster dies

If the qcluster subprocess exits unexpectedly, the loop breaks and the command returns normally (exit code 0), so orchestrators (Kubernetes, etc.) will not restart the worker. Propagate `proc.returncode` via `sys.exit` so failures are visible to the supervisor.
try:
proc.wait(timeout=10)
except subprocess.TimeoutExpired:
proc.kill()
proc.wait()
Comment thread
taylor-osler-sentry marked this conversation as resolved.
28 changes: 28 additions & 0 deletions src/firetower/incidents/migrations/0016_schedule_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from django.db import migrations

from firetower.incidents.tasks import SCHEDULES
Comment thread
taylor-osler-sentry marked this conversation as resolved.


def create_schedule(apps, schema_editor):
Schedule = apps.get_model("django_q", "Schedule")
schedule_name = "schedule_demo"
Schedule.objects.get_or_create(
name=schedule_name, defaults=SCHEDULES[schedule_name]
)
Comment thread
taylor-osler-sentry marked this conversation as resolved.


def delete_schedule(apps, schema_editor):
Schedule = apps.get_model("django_q", "Schedule")
schedule_name = "schedule_demo"
Schedule.objects.filter(name=schedule_name).delete()


class Migration(migrations.Migration):
dependencies = [
("incidents", "0015_add_notion_troubleshooting_link_type"),
("django_q", "0018_task_success_index"),
]

operations = [
migrations.RunPython(create_schedule, delete_schedule),
]
60 changes: 60 additions & 0 deletions src/firetower/incidents/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import functools
import logging
import re
from typing import Protocol

from datadog import statsd
from django_q.tasks import Schedule

from firetower.incidents.models import Incident

SCHEDULES = {
"schedule_demo": {
"func": "firetower.incidents.tasks.schedule_demo",
"schedule_type": Schedule.MINUTES, # Minutes
Comment thread
github-actions[bot] marked this conversation as resolved.
Comment thread
sentry[bot] marked this conversation as resolved.
Comment thread
cursor[bot] marked this conversation as resolved.
"minutes": 5,
"repeats": -1, # repeat indefinitely
},
}

DATADOG_INVALID_CHARS = re.compile(r"[^A-Za-z0-9-_.\/]")


logger = logging.getLogger(__name__)


class NamedFunction(Protocol):
__name__: str

def __call__(self) -> None:
pass


def datadog_log(f: NamedFunction) -> NamedFunction:
task_name: str = DATADOG_INVALID_CHARS.sub("_", f.__name__)
tags = [f"task:{task_name}"]

@functools.wraps(f)
def wrapper() -> None:
statsd.increment("django_q.task.run", 1, tags)
try:
f()
except Exception as e:
statsd.increment("django_q.task.error", 1, tags)
logger.error(
f"Error while executing task '{task_name}': {e}", exc_info=True
)
Comment thread
sentry-warden[bot] marked this conversation as resolved.
Comment thread
sentry[bot] marked this conversation as resolved.
raise e
Comment thread
cursor[bot] marked this conversation as resolved.
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raise e corrupts traceback in decorator wrapper

Low Severity

Using raise e instead of bare raise in the datadog_log exception handler adds an extra frame to the traceback, making it appear the exception originated from the decorator wrapper rather than the actual task function. This makes debugging task failures harder since the traceback points to the wrong location.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 1cca0d2. Configure here.

else:
statsd.increment("django_q.task.success", 1, tags)

return wrapper


@datadog_log
def schedule_demo() -> None:
incident = Incident.objects.order_by("-created_at").first()
if incident:
logger.info(f"Most recent incident: INC-{incident.id}: {incident.title}")
else:
logger.info("No incidents found.")
Loading
Loading