-
Notifications
You must be signed in to change notification settings - Fork 2.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(ingest): add a transformer for adding ownership (#2532)
- Loading branch information
Showing 5 changed files with 239 additions and 7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
96 changes: 96 additions & 0 deletions
96
metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
from typing import Callable, Iterable, List, Union | ||
|
||
import datahub.emitter.mce_builder as builder | ||
from datahub.configuration.common import ConfigModel | ||
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope | ||
from datahub.ingestion.api.transform import Transformer | ||
from datahub.metadata.schema_classes import ( | ||
AuditStampClass, | ||
DatasetSnapshotClass, | ||
MetadataChangeEventClass, | ||
OwnerClass, | ||
OwnershipClass, | ||
OwnershipTypeClass, | ||
) | ||
|
||
|
||
class AddDatasetOwnershipConfig(ConfigModel):
    """Configuration for the callback-driven ``AddDatasetOwnership`` transformer."""

    # Callback that receives a dataset snapshot and returns the owners to add.
    # The Union of two identical Callable types is intentional: it is a
    # workaround for https://github.com/python/mypy/issues/708, as
    # suggested by https://stackoverflow.com/a/64528725/5004662.
    get_owners_to_add: Union[
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
    ]

    # Actor recorded in the audit stamp of an ownership aspect created by the
    # transformer; defaults to the "etl" user urn.
    default_actor: str = builder.make_user_urn("etl")
|
||
|
||
class AddDatasetOwnership(Transformer):
    """Transformer that adds owners to datasets according to a callback function.

    The callback is ``config.get_owners_to_add``. It is called with the
    proposed snapshot of each dataset MCE flowing through the transformer and
    the owners it returns are appended to that MCE's ownership aspect.
    """

    ctx: PipelineContext
    config: AddDatasetOwnershipConfig

    def __init__(self, config: AddDatasetOwnershipConfig, ctx: PipelineContext):
        self.config = config
        self.ctx = ctx

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetOwnership":
        """Build the transformer from a raw configuration dictionary."""
        return cls(AddDatasetOwnershipConfig.parse_obj(config_dict), ctx)

    def transform(
        self, record_envelopes: Iterable[RecordEnvelope]
    ) -> Iterable[RecordEnvelope]:
        """Lazily yield every envelope, rewriting the ones that carry an MCE.

        Envelopes whose record is not a ``MetadataChangeEventClass`` are
        yielded untouched.
        """
        for envelope in record_envelopes:
            record = envelope.record
            if isinstance(record, MetadataChangeEventClass):
                envelope.record = self.transform_one(record)
            yield envelope

    def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
        """Append the callback's owners to a dataset MCE and return the MCE.

        The MCE is modified in place. It is returned unchanged when its
        proposed snapshot is not a dataset snapshot, or when the callback
        returns nothing to add.
        """
        snapshot = mce.proposedSnapshot
        if not isinstance(snapshot, DatasetSnapshotClass):
            return mce

        extra_owners = self.config.get_owners_to_add(snapshot)
        if not extra_owners:
            return mce

        # Ownership aspect handed to the builder as the default; stamped with
        # the current system time and the configured default actor.
        fallback_ownership = OwnershipClass(
            owners=[],
            lastModified=AuditStampClass(
                time=builder.get_sys_time(),
                actor=self.config.default_actor,
            ),
        )
        # NOTE(review): presumably returns the MCE's existing ownership aspect
        # when there is one and otherwise attaches the fallback - confirm
        # against mce_builder.get_or_add_aspect.
        ownership = builder.get_or_add_aspect(mce, fallback_ownership)
        ownership.owners.extend(extra_owners)
        return mce
|
||
|
||
class SimpleDatasetOwnershipConfig(ConfigModel):
    """Configuration for ``SimpleAddDatasetOwnership``: a fixed list of owner urns."""

    # Urns of the owners to add to each dataset.
    owner_urns: List[str]

    # Actor forwarded to the generic ownership config; defaults to the "etl"
    # user urn.
    default_actor: str = builder.make_user_urn("etl")
|
||
|
||
class SimpleAddDatasetOwnership(AddDatasetOwnership):
    """Transformer that adds a specified set of owners to each dataset."""

    def __init__(self, config: SimpleDatasetOwnershipConfig, ctx: PipelineContext):
        # Every configured urn becomes an owner of type DATAOWNER.
        fixed_owners = [
            OwnerClass(owner=urn, type=OwnershipTypeClass.DATAOWNER)
            for urn in config.owner_urns
        ]

        def _constant_owners(_snapshot):
            # The same list is returned for every dataset, whatever the snapshot.
            return fixed_owners

        # Express the simple config as a generic callback-based config and let
        # the parent class do the actual work.
        super().__init__(
            AddDatasetOwnershipConfig(
                get_owners_to_add=_constant_owners,
                default_actor=config.default_actor,
            ),
            ctx,
        )

    @classmethod
    def create(
        cls, config_dict: dict, ctx: PipelineContext
    ) -> "SimpleAddDatasetOwnership":
        """Build the transformer from a raw configuration dictionary."""
        return cls(SimpleDatasetOwnershipConfig.parse_obj(config_dict), ctx)
5 changes: 5 additions & 0 deletions
5
metadata-ingestion/src/datahub/ingestion/transformer/transform_registry.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,9 @@ | ||
from datahub.ingestion.api.registry import Registry | ||
from datahub.ingestion.api.transform import Transformer | ||
|
||
from .add_dataset_ownership import AddDatasetOwnership, SimpleAddDatasetOwnership | ||
|
||
# Registry of the available Transformer implementations, keyed by name.
transform_registry = Registry[Transformer]()

# Ownership transformers: the callback-driven variant and the fixed-list variant.
transform_registry.register("add_dataset_ownership", AddDatasetOwnership)
transform_registry.register("simple_add_dataset_ownership", SimpleAddDatasetOwnership)
87 changes: 87 additions & 0 deletions
87
metadata-ingestion/tests/unit/test_transform_dataset_ownership.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import datahub.emitter.mce_builder as builder | ||
import datahub.metadata.schema_classes as models | ||
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope | ||
from datahub.ingestion.transformer.add_dataset_ownership import ( | ||
SimpleAddDatasetOwnership, | ||
) | ||
|
||
|
||
def test_simple_dataset_ownership_tranformation(mock_time):
    """Run SimpleAddDatasetOwnership over three MCEs and check the owner counts.

    The inputs are a dataset without an ownership aspect, a dataset that
    already has one owner, and a data job (not a dataset). Two owners are
    configured, so the expected owner counts are 2 and 3 for the datasets,
    and the data job must come out equal to what went in.
    """
    dataset_without_owners = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
            aspects=[
                models.StatusClass(removed=False),
            ],
        ),
    )

    existing_owner = models.OwnerClass(
        owner=builder.make_user_urn("fake_owner"),
        type=models.OwnershipTypeClass.DATAOWNER,
    )
    dataset_with_owner = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[existing_owner],
                    lastModified=models.AuditStampClass(
                        time=builder.get_sys_time(), actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        ),
    )

    data_job = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    mces = [dataset_without_owners, dataset_with_owner, data_job]

    transformer_config = {
        "owner_urns": [
            builder.make_user_urn("person1"),
            builder.make_user_urn("person2"),
        ]
    }
    transformer = SimpleAddDatasetOwnership.create(
        transformer_config,
        PipelineContext(run_id="test"),
    )

    envelopes = [RecordEnvelope(mce, metadata={}) for mce in mces]
    results = list(transformer.transform(envelopes))

    # Every input record must come out the other side.
    assert len(results) == len(mces)

    # First entry: an ownership aspect now exists and holds the two new owners.
    ownership_of_first = builder.get_aspect_if_available(
        results[0].record, models.OwnershipClass
    )
    assert ownership_of_first
    assert len(ownership_of_first.owners) == 2

    # Second entry: the original owner plus the two new owners.
    ownership_of_second = builder.get_aspect_if_available(
        results[1].record, models.OwnershipClass
    )
    assert ownership_of_second
    assert len(ownership_of_second.owners) == 3

    # Third entry is not a dataset, so it must be unchanged.
    assert mces[2] == results[2].record