/
schedule_definition.py
540 lines (448 loc) · 22.1 KB
/
schedule_definition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
import copy
from contextlib import ExitStack
from datetime import datetime
from enum import Enum
from typing import Any, Callable, Dict, Iterator, List, NamedTuple, Optional, TypeVar, Union, cast
import pendulum
from typing_extensions import TypeGuard
import dagster._check as check
from ...serdes import whitelist_for_serdes
from ...utils import ensure_gen, merge_dicts
from ...utils.schedules import is_valid_cron_string
from ..decorator_utils import get_function_params
from ..errors import (
DagsterInvalidDefinitionError,
DagsterInvalidInvocationError,
DagsterInvariantViolationError,
ScheduleExecutionError,
user_code_error_boundary,
)
from ..instance import DagsterInstance
from ..instance.ref import InstanceRef
from ..storage.pipeline_run import PipelineRun
from .graph_definition import GraphDefinition
from .mode import DEFAULT_MODE_NAME
from .pipeline_definition import PipelineDefinition
from .run_request import RunRequest, SkipReason
from .target import DirectTarget, RepoRelativeTarget
from .unresolved_asset_job_definition import UnresolvedAssetJobDefinition
from .utils import check_valid_name, validate_tags
T = TypeVar("T")
@whitelist_for_serdes
class DefaultScheduleStatus(Enum):
    """Status a schedule has before a user manually starts or stops it."""

    RUNNING = "RUNNING"
    STOPPED = "STOPPED"
class ScheduleEvaluationContext:
    """Schedule-specific execution context.

    An instance of this class is made available as the first argument to various ScheduleDefinition
    functions. It is passed as the first argument to ``run_config_fn``, ``tags_fn``,
    and ``should_execute``.

    Attributes:
        instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule
        scheduled_execution_time (datetime):
            The time in which the execution was scheduled to happen. May differ slightly
            from both the actual execution time and the time at which the run config is computed.
            Not available in all schedulers - currently only set in deployments using
            DagsterDaemonScheduler.
    """

    __slots__ = ["_instance_ref", "_scheduled_execution_time", "_exit_stack", "_instance"]

    def __init__(
        self, instance_ref: Optional[InstanceRef], scheduled_execution_time: Optional[datetime]
    ):
        self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)
        self._scheduled_execution_time = check.opt_inst_param(
            scheduled_execution_time, "scheduled_execution_time", datetime
        )
        # The instance is materialized lazily; the exit stack owns its lifetime so that
        # closing this context (via __exit__) also tears the instance down.
        self._exit_stack = ExitStack()
        self._instance = None

    def __enter__(self):
        return self

    def __exit__(self, _exception_type, _exception_value, _traceback):
        # Release any instance opened through the exit stack.
        self._exit_stack.close()

    @property
    def instance(self) -> "DagsterInstance":
        """Lazily construct (and cache) the DagsterInstance from the stored ref."""
        # self._instance_ref should only ever be None when this ScheduleEvaluationContext was
        # constructed under test.
        if not self._instance_ref:
            raise DagsterInvariantViolationError(
                "Attempted to initialize dagster instance, but no instance reference was provided."
            )
        if not self._instance:
            instance_cm = DagsterInstance.from_ref(self._instance_ref)
            self._instance = self._exit_stack.enter_context(instance_cm)
        return cast(DagsterInstance, self._instance)

    @property
    def scheduled_execution_time(self) -> Optional[datetime]:
        """The time this tick was scheduled for, when the scheduler provides one."""
        return self._scheduled_execution_time
# Preserve ScheduleExecutionContext for backcompat so type annotations don't break.
ScheduleExecutionContext = ScheduleEvaluationContext

# Type aliases describing the values a schedule evaluation function may produce.
RunConfig = Dict[str, Any]
RunRequestIterator = Iterator[Union[RunRequest, SkipReason]]

# A schedule evaluation may return run requests, a skip, raw run config, or a list of requests.
ScheduleEvaluationFunctionReturn = Union[
    RunRequest, SkipReason, RunConfig, RunRequestIterator, List[RunRequest]
]

# User evaluation functions may accept the context as their single argument, or no arguments.
RawScheduleEvaluationFunction = Union[
    Callable[[ScheduleEvaluationContext], ScheduleEvaluationFunctionReturn],
    Callable[[], ScheduleEvaluationFunctionReturn],
]

# run_config_fn variants: with or without the context argument.
RunConfigEvaluationFunction = Union[
    Callable[[ScheduleEvaluationContext], RunConfig],
    Callable[[], RunConfig],
]
class DecoratedScheduleFunction(NamedTuple):
    """Wrapper around the decorated schedule function. Keeps track of both to better support the
    optimal return value for direct invocation of the evaluation function"""

    # The user's function exactly as passed to the decorator.
    decorated_fn: RawScheduleEvaluationFunction
    # Normalized wrapper that always accepts a context and yields RunRequest/SkipReason values.
    wrapped_fn: Callable[[ScheduleEvaluationContext], RunRequestIterator]
    # Whether decorated_fn accepts the evaluation context as its argument.
    has_context_arg: bool
def is_context_provided(
    fn: Union[Callable[[ScheduleEvaluationContext], T], Callable[[], T]]
) -> TypeGuard[Callable[[ScheduleEvaluationContext], T]]:
    """Return True when ``fn`` declares exactly one parameter (the context)."""
    params = get_function_params(fn)
    return len(params) == 1
def build_schedule_context(
    instance: Optional[DagsterInstance] = None, scheduled_execution_time: Optional[datetime] = None
) -> ScheduleEvaluationContext:
    """Builds schedule execution context using the provided parameters.

    The instance provided to ``build_schedule_context`` must be persistent;
    DagsterInstance.ephemeral() will result in an error.

    Args:
        instance (Optional[DagsterInstance]): The dagster instance configured to run the schedule.
        scheduled_execution_time (datetime): The time in which the execution was scheduled to
            happen. May differ slightly from both the actual execution time and the time at which
            the run config is computed.

    Examples:
        .. code-block:: python

            context = build_schedule_context(instance)
            daily_schedule.evaluate_tick(context)
    """
    check.opt_inst_param(instance, "instance", DagsterInstance)
    scheduled_execution_time = check.opt_inst_param(
        scheduled_execution_time, "scheduled_execution_time", datetime
    )

    # Only a persistent instance can be serialized into a ref for the context.
    instance_ref = None
    if instance and instance.is_persistent:
        instance_ref = instance.get_ref()

    return ScheduleEvaluationContext(
        instance_ref=instance_ref,
        scheduled_execution_time=scheduled_execution_time,
    )
@whitelist_for_serdes
class ScheduleExecutionData(NamedTuple):
    """Serializable result of evaluating one schedule tick."""

    # Run requests to launch; empty/None when the tick was skipped.
    run_requests: Optional[List[RunRequest]]
    # Human-readable reason for skipping, when no runs were requested.
    skip_message: Optional[str]
class ScheduleDefinition:
    """Define a schedule that targets a job

    Args:
        name (Optional[str]): The name of the schedule to create. Defaults to the job name plus
            "_schedule".
        cron_schedule (str): A valid cron string specifying when the schedule will run, e.g.,
            '45 23 * * 6' for a schedule that runs at 11:45 PM every Saturday.
        pipeline_name (Optional[str]): (legacy) The name of the pipeline to execute when the schedule runs.
        execution_fn (Callable[ScheduleEvaluationContext]): The core evaluation function for the
            schedule, which is run at an interval to determine whether a run should be launched or
            not. Takes a :py:class:`~dagster.ScheduleEvaluationContext`.

            This function must return a generator, which must yield either a single SkipReason
            or one or more RunRequest objects.
        run_config (Optional[Dict]): The config that parameterizes this execution,
            as a dict.
        run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Dict]]]): A function that
            takes a ScheduleEvaluationContext object and returns the run configuration that
            parameterizes this execution, as a dict. You may set only one of ``run_config``,
            ``run_config_fn``, and ``execution_fn``.
        tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach
            to the scheduled runs.
        tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes a
            :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string
            key-value pairs). You may set only one of ``tags``, ``tags_fn``, and ``execution_fn``.
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): (legacy) The mode to apply when executing this schedule. (default: 'default')
        should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs
            at schedule execution time to determine whether a schedule should execute or skip. Takes
            a :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[dict[str, str]]): The environment variables to set for the
            schedule
        execution_timezone (Optional[str]): Timezone in which the schedule should run.
            Supported strings for timezones are the ones provided by the
            `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".
        description (Optional[str]): A human-readable description of the schedule.
        job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this
            schedule runs.
        default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. The default
            status can be overridden from Dagit or via the GraphQL API.
    """

    def __init__(
        self,
        name: Optional[str] = None,
        cron_schedule: Optional[str] = None,
        pipeline_name: Optional[str] = None,
        run_config: Optional[Any] = None,
        run_config_fn: Optional[RunConfigEvaluationFunction] = None,
        tags: Optional[Dict[str, str]] = None,
        tags_fn: Optional[Callable[..., Optional[Dict[str, str]]]] = None,
        solid_selection: Optional[List[Any]] = None,
        mode: Optional[str] = "default",
        should_execute: Optional[Callable[..., bool]] = None,
        environment_vars: Optional[Dict[str, str]] = None,
        execution_timezone: Optional[str] = None,
        execution_fn: Optional[
            Union[Callable[[ScheduleEvaluationContext], Any], DecoratedScheduleFunction]
        ] = None,
        description: Optional[str] = None,
        job: Optional[
            Union[GraphDefinition, PipelineDefinition, UnresolvedAssetJobDefinition]
        ] = None,
        default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,
    ):
        self._cron_schedule = check.str_param(cron_schedule, "cron_schedule")
        if not is_valid_cron_string(self._cron_schedule):
            raise DagsterInvalidDefinitionError(
                f"Found invalid cron schedule '{self._cron_schedule}' for schedule '{name}''. "
                "Dagster recognizes standard cron expressions consisting of 5 fields."
            )

        # A schedule either targets a job object directly (DirectTarget) or refers to a
        # pipeline by name within the repository (RepoRelativeTarget, legacy API).
        if job is not None:
            self._target: Union[DirectTarget, RepoRelativeTarget] = DirectTarget(job)
        else:
            self._target = RepoRelativeTarget(
                pipeline_name=check.str_param(pipeline_name, "pipeline_name"),
                mode=check.opt_str_param(mode, "mode") or DEFAULT_MODE_NAME,
                solid_selection=check.opt_nullable_list_param(
                    solid_selection, "solid_selection", of_type=str
                ),
            )

        # Derive a name from the target when one is not provided explicitly.
        # NOTE(review): if none of name/pipeline_name/job is given, _name is never
        # assigned -- presumably callers always supply at least one; verify upstream.
        if name:
            self._name = check_valid_name(name)
        elif pipeline_name:
            self._name = pipeline_name + "_schedule"
        elif job:
            self._name = job.name + "_schedule"

        self._description = check.opt_str_param(description, "description")
        self._environment_vars = check.opt_dict_param(
            environment_vars, "environment_vars", key_type=str, value_type=str
        )
        self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")

        # The two configuration styles are mutually exclusive: either a single
        # execution_fn, or the individual run_config/tags/should_execute pieces.
        if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):
            raise DagsterInvalidDefinitionError(
                "Attempted to provide both execution_fn and individual run_config/tags arguments "
                "to ScheduleDefinition. Must provide only one of the two."
            )
        elif execution_fn:
            self._execution_fn: Optional[
                Union[Callable[..., Any], DecoratedScheduleFunction]
            ] = None
            if isinstance(execution_fn, DecoratedScheduleFunction):
                self._execution_fn = execution_fn
            else:
                self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")
            self._run_config_fn = None
        else:
            if run_config_fn and run_config:
                raise DagsterInvalidDefinitionError(
                    "Attempted to provide both run_config_fn and run_config as arguments"
                    " to ScheduleDefinition. Must provide only one of the two."
                )

            # Static run_config is wrapped in a constant function so both forms share
            # one code path downstream.
            # pylint: disable=unused-argument
            def _default_run_config_fn(context: ScheduleEvaluationContext) -> RunConfig:
                return check.opt_dict_param(run_config, "run_config")

            self._run_config_fn = check.opt_callable_param(
                run_config_fn, "run_config_fn", default=_default_run_config_fn
            )

            if tags_fn and tags:
                raise DagsterInvalidDefinitionError(
                    "Attempted to provide both tags_fn and tags as arguments"
                    " to ScheduleDefinition. Must provide only one of the two."
                )
            elif tags:
                # Static tags are validated once up front, then served by a constant fn.
                tags = validate_tags(tags, allow_reserved_tags=False)
                tags_fn = lambda _context: tags
            else:
                tags_fn = check.opt_callable_param(
                    tags_fn, "tags_fn", default=lambda _context: cast(Dict[str, str], {})
                )

            should_execute = check.opt_callable_param(
                should_execute, "should_execute", default=lambda _context: True
            )

            # Synthesize a generator-style execution function out of the individual
            # should_execute / run_config_fn / tags_fn pieces. Each user callback runs
            # inside an error boundary so failures surface as ScheduleExecutionError.
            def _execution_fn(context):
                with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: f"Error occurred during the execution of should_execute for schedule {name}",
                ):
                    if not should_execute(context):
                        yield SkipReason(
                            "should_execute function for {schedule_name} returned false.".format(
                                schedule_name=name
                            )
                        )
                        return

                with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: f"Error occurred during the execution of run_config_fn for schedule {name}",
                ):
                    run_config_fn = check.not_none(self._run_config_fn)
                    # Deep-copy so later mutation cannot leak back into user state.
                    evaluated_run_config = copy.deepcopy(
                        run_config_fn(context)
                        if is_context_provided(run_config_fn)
                        else run_config_fn()  # type: ignore
                    )

                with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: f"Error occurred during the execution of tags_fn for schedule {name}",
                ):
                    evaluated_tags = validate_tags(tags_fn(context), allow_reserved_tags=False)

                yield RunRequest(
                    run_key=None,
                    run_config=evaluated_run_config,
                    tags=evaluated_tags,
                )

            self._execution_fn = _execution_fn

        if self._execution_timezone:
            try:
                # Verify that the timezone can be loaded
                pendulum.tz.timezone(self._execution_timezone)
            except Exception as e:
                raise DagsterInvalidDefinitionError(
                    f"Invalid execution timezone {self._execution_timezone} for {name}"
                ) from e

        self._default_status = check.inst_param(
            default_status, "default_status", DefaultScheduleStatus
        )

    def __call__(self, *args, **kwargs):
        """Directly invoke the decorated schedule function (primarily for testing).

        Only supported for schedules created via the schedule decorators; raises
        DagsterInvalidInvocationError otherwise, or when the argument shape does not
        match the decorated function's (optional) single context parameter.
        """
        from .decorators.schedule_decorator import DecoratedScheduleFunction

        if not isinstance(self._execution_fn, DecoratedScheduleFunction):
            raise DagsterInvalidInvocationError(
                "Schedule invocation is only supported for schedules created via the schedule "
                "decorators."
            )
        result = None
        if self._execution_fn.has_context_arg:
            # The decorated fn expects a context: require exactly one argument,
            # passed positionally or under the function's own parameter name.
            if len(args) == 0 and len(kwargs) == 0:
                raise DagsterInvalidInvocationError(
                    "Schedule decorated function has context argument, but no context argument was "
                    "provided when invoking."
                )
            if len(args) + len(kwargs) > 1:
                raise DagsterInvalidInvocationError(
                    "Schedule invocation received multiple arguments. Only a first "
                    "positional context parameter should be provided when invoking."
                )

            context_param_name = get_function_params(self._execution_fn.decorated_fn)[0].name

            if args:
                context = check.opt_inst_param(
                    args[0], context_param_name, ScheduleEvaluationContext
                )
            else:
                if context_param_name not in kwargs:
                    raise DagsterInvalidInvocationError(
                        f"Schedule invocation expected argument '{context_param_name}'."
                    )
                context = check.opt_inst_param(
                    kwargs[context_param_name], context_param_name, ScheduleEvaluationContext
                )

            # Fall back to an empty test context when None was passed explicitly.
            context = context if context else build_schedule_context()

            result = self._execution_fn.decorated_fn(context)  # type: ignore
        else:
            if len(args) + len(kwargs) > 0:
                raise DagsterInvalidInvocationError(
                    "Decorated schedule function takes no arguments, but arguments were provided."
                )
            result = self._execution_fn.decorated_fn()  # type: ignore

        # Copy dict results so the caller cannot mutate captured run config.
        if isinstance(result, dict):
            return copy.deepcopy(result)
        else:
            return result

    @property
    def name(self) -> str:
        """The schedule's (possibly derived) name."""
        return self._name

    @property
    def pipeline_name(self) -> str:
        """Name of the targeted pipeline/job."""
        return self._target.pipeline_name

    @property
    def solid_selection(self) -> Optional[List[Any]]:
        """(legacy) Solid subselection executed by this schedule, if any."""
        return self._target.solid_selection

    @property
    def mode(self) -> str:
        """(legacy) Mode applied when the schedule executes."""
        return self._target.mode

    @property
    def description(self) -> Optional[str]:
        return self._description

    @property
    def cron_schedule(self) -> str:
        return self._cron_schedule

    @property
    def environment_vars(self) -> Dict[str, str]:
        return self._environment_vars

    @property
    def execution_timezone(self) -> Optional[str]:
        return self._execution_timezone

    @property
    def job(self) -> Union[GraphDefinition, PipelineDefinition, UnresolvedAssetJobDefinition]:
        """The directly-targeted job; raises for legacy repo-relative targets."""
        if isinstance(self._target, DirectTarget):
            return self._target.target
        raise DagsterInvalidDefinitionError("No job was provided to ScheduleDefinition.")

    def evaluate_tick(self, context: "ScheduleEvaluationContext") -> ScheduleExecutionData:
        """Evaluate schedule using the provided context.

        Args:
            context (ScheduleEvaluationContext): The context with which to evaluate this schedule.
        Returns:
            ScheduleExecutionData: Contains list of run requests, or skip message if present.
        """
        check.inst_param(context, "context", ScheduleEvaluationContext)
        execution_fn: Callable[[ScheduleEvaluationContext], "ScheduleEvaluationFunctionReturn"]
        if isinstance(self._execution_fn, DecoratedScheduleFunction):
            execution_fn = self._execution_fn.wrapped_fn
        else:
            execution_fn = cast(
                Callable[[ScheduleExecutionContext], "ScheduleEvaluationFunctionReturn"],
                self._execution_fn,
            )

        # Materialize the (possibly generator) result for inspection below.
        result = list(ensure_gen(execution_fn(context)))

        skip_message: Optional[str] = None

        run_requests: List[RunRequest] = []
        if not result or result == [None]:
            # Nothing yielded (or a bare None): treat as a skip.
            run_requests = []
            skip_message = "Schedule function returned an empty result"
        elif len(result) == 1:
            # A single value may be either a RunRequest or a SkipReason.
            item = check.inst(result[0], (SkipReason, RunRequest))
            if isinstance(item, RunRequest):
                run_requests = [item]
                skip_message = None
            elif isinstance(item, SkipReason):
                run_requests = []
                skip_message = item.skip_message
        else:
            # Multiple values must all be RunRequests, each with a run_key so the
            # scheduler can dedupe them.
            # NOTE: mypy is not correctly reading this cast-- not sure why
            # (pyright reads it fine). Hence the type-ignores below.
            result = cast(List[RunRequest], check.is_list(result, of_type=RunRequest))  # type: ignore
            check.invariant(
                not any(not request.run_key for request in result),  # type: ignore
                "Schedules that return multiple RunRequests must specify a run_key in each RunRequest",
            )
            run_requests = result  # type: ignore
            skip_message = None

        # clone all the run requests with the required schedule tags
        run_requests_with_schedule_tags = [
            RunRequest(
                run_key=request.run_key,
                run_config=request.run_config,
                tags=merge_dicts(request.tags, PipelineRun.tags_for_schedule(self)),
            )
            for request in run_requests
        ]

        return ScheduleExecutionData(
            run_requests=run_requests_with_schedule_tags, skip_message=skip_message
        )

    def has_loadable_target(self) -> bool:
        """Whether this schedule targets a job object directly (vs. by name)."""
        return isinstance(self._target, DirectTarget)

    @property
    def targets_unresolved_asset_job(self) -> bool:
        return self.has_loadable_target() and isinstance(
            self.load_target(), UnresolvedAssetJobDefinition
        )

    def load_target(
        self,
    ) -> Union[GraphDefinition, PipelineDefinition, UnresolvedAssetJobDefinition]:
        """Load and return the directly-targeted job; fails for repo-relative targets."""
        if isinstance(self._target, DirectTarget):
            return self._target.load()

        check.failed("Target is not loadable")

    @property
    def default_status(self) -> DefaultScheduleStatus:
        return self._default_status