-
-
Notifications
You must be signed in to change notification settings - Fork 5
fix(scheduler) Make schedule changes take effect immediately #590
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,10 +36,10 @@ def __init__( | |
| self._redis = redis | ||
| self._metrics = metrics | ||
|
|
||
| def _make_key(self, taskname: str) -> str: | ||
| return f"tw:scheduler:{taskname}" | ||
| def _make_key(self, key: str) -> str: | ||
| return f"tw:scheduler:{key}" | ||
|
|
||
| def set(self, taskname: str, next_runtime: datetime) -> bool: | ||
| def set(self, key: str, next_runtime: datetime) -> bool: | ||
| """ | ||
| Record a spawn time for a task. | ||
| The next_runtime parameter indicates when the record should expire, | ||
|
|
@@ -51,37 +51,49 @@ def set(self, taskname: str, next_runtime: datetime) -> bool: | |
| # next_runtime & now could be the same second, and redis gets sad if ex=0 | ||
| duration = max(int((next_runtime - now).total_seconds()), 1) | ||
|
|
||
| result = self._redis.set(self._make_key(taskname), now.isoformat(), ex=duration, nx=True) | ||
| result = self._redis.set(self._make_key(key), now.isoformat(), ex=duration, nx=True) | ||
| return bool(result) | ||
|
|
||
| def read(self, taskname: str) -> datetime | None: | ||
| def read(self, key: str) -> datetime | None: | ||
| """ | ||
| Retrieve the last run time of a task | ||
| Returns None if last run time has expired or is unknown. | ||
| """ | ||
| result = self._redis.get(self._make_key(taskname)) | ||
| result = self._redis.get(self._make_key(key)) | ||
| if result: | ||
| return datetime.fromisoformat(result) | ||
|
|
||
| self._metrics.incr( | ||
| "taskworker.scheduler.run_storage.read.miss", tags={"taskname": taskname} | ||
| ) | ||
| self._metrics.incr("taskworker.scheduler.run_storage.read.miss", tags={"taskname": key}) | ||
| return None | ||
|
|
||
| def read_many(self, tasknames: list[str]) -> Mapping[str, datetime | None]: | ||
| def read_many( | ||
| self, | ||
| storage_keys: list[str], | ||
| ) -> Mapping[str, datetime | None]: | ||
| """ | ||
| Retreive last run times in bulk | ||
| Retrieve last run times in bulk. | ||
|
|
||
| storage_keys are the new-format keys including the schedule_id suffix | ||
| (e.g. "test:valid:300"). Falls back to the legacy key (derived by | ||
| stripping the suffix) when the new key has no data, allowing a seamless | ||
| first-deploy transition. | ||
|
|
||
| Returns a mapping keyed by storage_key. | ||
| """ | ||
| values = self._redis.mget([self._make_key(taskname) for taskname in tasknames]) | ||
| run_times = { | ||
| taskname: datetime.fromisoformat(value) if value else None | ||
| for taskname, value in zip(tasknames, values) | ||
| } | ||
| legacy_keys = [sk.rsplit(":", 1)[0] for sk in storage_keys] | ||
|
|
||
| new_values = self._redis.mget([self._make_key(sk) for sk in storage_keys]) | ||
| legacy_values = self._redis.mget([self._make_key(lk) for lk in legacy_keys]) | ||
|
|
||
| run_times: dict[str, datetime | None] = {} | ||
| for storage_key, new_val, legacy_val in zip(storage_keys, new_values, legacy_values): | ||
| raw = new_val if new_val is not None else legacy_val | ||
| run_times[storage_key] = datetime.fromisoformat(raw) if raw else None | ||
|
Comment on lines
+83
to
+91
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: The backwards-compatibility fallback for legacy run state can prevent a task from running immediately after a schedule change, delaying its execution until the new interval has passed. Suggested Fix: The backwards-compatibility logic should be adjusted to ignore the legacy […text truncated in source…]. Prompt for AI Agent. Did we get this right? 👍 / 👎 to inform future reviews. |
||
| return run_times | ||
|
|
||
| def delete(self, taskname: str) -> None: | ||
| def delete(self, key: str) -> None: | ||
| """remove a task key - mostly for testing.""" | ||
| self._redis.delete(self._make_key(taskname)) | ||
| self._redis.delete(self._make_key(key)) | ||
|
|
||
|
|
||
| class ScheduleEntry: | ||
|
|
@@ -112,6 +124,10 @@ def __repr__(self) -> str: | |
| def fullname(self) -> str: | ||
| return self._task.fullname | ||
|
|
||
| @property | ||
| def storage_key(self) -> str: | ||
| return f"{self.fullname}:{self._schedule.schedule_id()}" | ||
|
|
||
| @property | ||
| def namespace(self) -> str: | ||
| return self._task.namespace.name | ||
|
|
@@ -237,7 +253,7 @@ def tick(self) -> float: | |
| def _try_spawn(self, entry: ScheduleEntry) -> None: | ||
| now = datetime.now(tz=UTC) | ||
| next_runtime = entry.runtime_after(now) | ||
| if self._run_storage.set(entry.fullname, next_runtime): | ||
| if self._run_storage.set(entry.storage_key, next_runtime): | ||
| entry.delay_task() | ||
| entry.set_last_run(now) | ||
|
|
||
|
|
@@ -252,7 +268,7 @@ def _try_spawn(self, entry: ScheduleEntry) -> None: | |
| ) | ||
| else: | ||
| # We were not able to set a key, load last run from storage. | ||
| run_state = self._run_storage.read(entry.fullname) | ||
| run_state = self._run_storage.read(entry.storage_key) | ||
| entry.set_last_run(run_state) | ||
|
|
||
| logger.info( | ||
|
|
@@ -284,9 +300,9 @@ def _load_last_run(self) -> None: | |
| We synchronize each time the schedule set is modified and | ||
| then incrementally as tasks spawn attempts are made. | ||
| """ | ||
| last_run_times = self._run_storage.read_many([item.fullname for item in self._entries]) | ||
| last_run_times = self._run_storage.read_many([item.storage_key for item in self._entries]) | ||
| for item in self._entries: | ||
| last_run = last_run_times.get(item.fullname, None) | ||
| last_run = last_run_times.get(item.storage_key, None) | ||
| item.set_last_run(last_run) | ||
| logger.info( | ||
| "taskworker.scheduler.load_last_run", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a contrived example, but:
Is this scenario possible? And if so, does this result make sense?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I understand correctly, a task has both a legacy + new key (
`test:val` and `test:val:300`) and then the schedule is changed from 300 -> 3000? In that scenario we shouldn't have both `test:val:300` and `test:val:3000` in `storage_keys`, because `test:val` is the scheduled task name, and task names would be unique. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But isn't it possible to create such a scenario anyway? For example, if the
`scheduler` command has ... wouldn't this recreate the example Evan proposed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah that scenario would trigger the problem. Currently having two schedule entries with the same task and different schedules results in only one of the schedules being followed because the storage key is shared.
We currently prevent tasks from having two different schedules in Sentry with a test
https://github.com/getsentry/sentry/blob/41423c6cbfd603d2c63a7589d76c4f0dca2e6014/tests/sentry/taskworker/test_config.py#L30-L40
So the problematic scenario has been prevented for now. As we on-board more applications though we should make this more resilient to operator error. Using the schedule name as the storage key would help with that. I can follow up these changes with that improvement.