schema contract #594
Changes from 6 commits
@@ -0,0 +1,5 @@
+from dlt.common.exceptions import DltException
+
+class NormalizeException(DltException):
+    def __init__(self, msg: str) -> None:
+        super().__init__(msg)
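For context, a concrete normalize-stage error would subclass this new base like so (a hypothetical illustration, not part of the diff):

    class NormalizeJobFailed(NormalizeException):
        def __init__(self, load_id: str, msg: str) -> None:
            self.load_id = load_id
            super().__init__(f"normalize failed for load {load_id}: {msg}")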
@@ -67,14 +67,14 @@ def load_or_create_schema(schema_storage: SchemaStorage, schema_name: str) -> Sc

     @staticmethod
     def w_normalize_files(
+        normalize_config: NormalizeConfiguration,
         normalize_storage_config: NormalizeStorageConfiguration,
         loader_storage_config: LoadStorageConfiguration,
         destination_caps: DestinationCapabilitiesContext,
         stored_schema: TStoredSchema,
         load_id: str,
         extracted_items_files: Sequence[str],
     ) -> TWorkerRV:
         schema_updates: List[TSchemaUpdate] = []
         total_items = 0
         row_counts: TRowCount = {}
@@ -98,7 +98,7 @@ def w_normalize_files(
                 items_count = 0
                 for line_no, line in enumerate(f):
                     items: List[TDataItem] = json.loads(line)
-                    partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items)
+                    partial_update, items_count, r_counts = Normalize._w_normalize_chunk(normalize_config, load_storage, schema, load_id, root_table_name, items)
                     schema_updates.append(partial_update)
                     total_items += items_count
                     merge_row_count(row_counts, r_counts)
@@ -127,7 +127,7 @@ def w_normalize_files(
         return schema_updates, total_items, load_storage.closed_files(), row_counts

     @staticmethod
-    def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]:
+    def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]:
         column_schemas: Dict[str, TTableSchemaColumns] = {}  # quick access to column schema for writers below
         schema_update: TSchemaUpdate = {}
         schema_name = schema.name
@@ -139,31 +139,38 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str,
                 # filter row, may eliminate some or all fields
                 row = schema.filter_row(table_name, row)
                 # do not process empty rows
-                if row:
-                    # decode pua types
-                    for k, v in row.items():
-                        row[k] = custom_pua_decode(v)  # type: ignore
-                    # coerce row of values into schema table, generating partial table with new columns if any
-                    row, partial_table = schema.coerce_row(table_name, parent_table, row)
-                    # there's a new table or new columns in existing table
-                    if partial_table:
-                        # update schema and save the change
-                        schema.update_schema(partial_table)
-                        table_updates = schema_update.setdefault(table_name, [])
-                        table_updates.append(partial_table)
-                        # update our columns
-                        column_schemas[table_name] = schema.get_table_columns(table_name)
-                    # get current columns schema
-                    columns = column_schemas.get(table_name)
-                    if not columns:
-                        columns = schema.get_table_columns(table_name)
-                        column_schemas[table_name] = columns
-                    # store row
-                    # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock
-                    load_storage.write_data_item(load_id, schema_name, table_name, row, columns)
-                    # count total items
-                    items_count += 1
-                    increase_row_count(row_counts, table_name, 1)
+                if not row:
+                    continue
+                # decode pua types
+                for k, v in row.items():
+                    row[k] = custom_pua_decode(v)  # type: ignore
+                # coerce row of values into schema table, generating partial table with new columns if any
+                row, partial_table = schema.coerce_row(table_name, parent_table, row)
+                # check update
+                row, partial_table = schema.check_schema_update(table_name, row, partial_table, config.schema_update_mode)
Review comment: don't call it if `partial_table` is None! This is 99.99% of cases and this code is time critical.
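A minimal sketch of the guard being requested (assuming `check_schema_update` keeps the signature shown above); the call is skipped on the hot path whenever `coerce_row` produced no schema change:

    # hypothetical guard, per the review: only consult the contract when the
    # row actually produced a schema change (partial_table is not None)
    if partial_table is not None:
        row, partial_table = schema.check_schema_update(
            table_name, row, partial_table, config.schema_update_mode
        )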
+                if not row:
+                    continue
+
+                # there's a new table or new columns in existing table
+                if partial_table:
+                    # update schema and save the change
+                    schema.update_schema(partial_table)
+                    table_updates = schema_update.setdefault(table_name, [])
+                    table_updates.append(partial_table)
+                    # update our columns
+                    column_schemas[table_name] = schema.get_table_columns(table_name)
+                # get current columns schema
+                columns = column_schemas.get(table_name)
+                if not columns:
+                    columns = schema.get_table_columns(table_name)
+                    column_schemas[table_name] = columns
+                # store row
+                # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock
+                load_storage.write_data_item(load_id, schema_name, table_name, row, columns)
+                # count total items
+                items_count += 1
+                increase_row_count(row_counts, table_name, 1)
             signals.raise_if_signalled()
         return schema_update, items_count, row_counts
@@ -196,7 +203,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM
         workers = self.pool._processes  # type: ignore
         chunk_files = self.group_worker_files(files, workers)
         schema_dict: TStoredSchema = schema.to_dict()
-        config_tuple = (self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict)
+        config_tuple = (self.config, self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict)
Review comment: just FYI: this goes through a process boundary and will be pickled. Pay attention.
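A quick local smoke test for that concern (a sketch, not part of the PR): every element of `config_tuple` must survive a pickle round-trip before multiprocessing ships it to the worker processes.

    import pickle

    # multiprocessing pickles worker arguments, so each element of the
    # tuple handed to the pool must round-trip cleanly
    for item in config_tuple:
        pickle.loads(pickle.dumps(item))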
         param_chunk = [[*config_tuple, load_id, files] for files in chunk_files]
         tasks: List[Tuple[AsyncResult[TWorkerRV], List[Any]]] = []
         row_counts: TRowCount = {}
@@ -249,6 +256,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM

     def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV:
         result = Normalize.w_normalize_files(
+            self.config,
             self.normalize_storage.config,
             self.load_storage.config,
             self.config.destination_capabilities,
@@ -0,0 +1,76 @@
+import dlt, os, pytest
+from dlt.common.utils import uniq_id
+
+from tests.load.pipeline.utils import load_table_counts
+from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration
+from dlt.pipeline.exceptions import PipelineStepFailed
+from dlt.common.schema.exceptions import SchemaFrozenException
+from dlt.common.schema import utils
+
+SCHEMA_UPDATE_MODES = ["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"]
+
+@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), ids=lambda x: x.name)
+@pytest.mark.parametrize("update_mode", SCHEMA_UPDATE_MODES)
+def test_freeze_schema(update_mode: str, destination_config: DestinationTestConfiguration) -> None:
+
+    # freeze pipeline, drop additional values
+    # this will allow the first run to create the schema, but will not accept further updates after that
+    os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = update_mode
+    pipeline = destination_config.setup_pipeline("test_freeze_schema_2", dataset_name="freeze" + uniq_id())
+
+    @dlt.resource(name="items", write_disposition="append")
+    def load_items():
+        for _, index in enumerate(range(0, 10), 1):
+            yield {
+                "id": index,
+                "name": f"item {index}"
+            }
+
+    @dlt.resource(name="items", write_disposition="append")
+    def load_items_with_subitems():
+        for _, index in enumerate(range(0, 10), 1):
+            yield {
+                "id": index,
+                "name": f"item {index}",
+                "new_attribute": "hello",
+                "sub_items": [{
+                    "id": index + 1000,
+                    "name": f"sub item {index + 1000}"
+                }, {
+                    "id": index + 2000,
+                    "name": f"sub item {index + 2000}"
+                }]
+            }
+
+    pipeline.run([load_items], loader_file_format=destination_config.file_format)
+    table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+    # check data
+    assert table_counts["items"] == 10
+    schema_hash = utils.generate_version_hash(pipeline.default_schema.to_dict())
+
+    # on freeze and raise we expect an exception
+    if update_mode == "freeze-and-raise":
+        with pytest.raises(PipelineStepFailed) as py_ex:
+            pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format)
+        assert isinstance(py_ex.value.__context__, SchemaFrozenException)
+    else:
+        pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format)
+
+    # check data
+    table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+    assert table_counts["items"] == (20 if update_mode not in ["freeze-and-raise", "freeze-and-discard"] else 10)
+
+    # frozen schemas should not have changed
+    if update_mode != "evolve":
+        assert schema_hash == utils.generate_version_hash(pipeline.default_schema.to_dict())
+        assert "items__sub_items" not in table_counts
+        # schema was not migrated to contain new attribute
+        assert "new_attribute" not in pipeline.default_schema.tables["items"]["columns"]
+    # regular mode evolves the schema
+    else:
+        assert table_counts["items__sub_items"] == 20
+        # schema was migrated to contain new attribute
+        assert "new_attribute" in pipeline.default_schema.tables["items"]["columns"]
Review comment: this is a solid implementation, but I'd like to describe it in the ticket (we can reuse it for the docs) and maybe modify the requirements.
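For that write-up, a minimal usage sketch (hedged: the env var name and mode values are taken from the test above, and the first-run behavior follows the test's comments):

    import os
    import dlt

    # one of: evolve, freeze-and-trim, freeze-and-raise, freeze-and-discard
    os.environ["NORMALIZE__SCHEMA_UPDATE_MODE"] = "freeze-and-raise"

    pipeline = dlt.pipeline(pipeline_name="frozen", destination="duckdb", dataset_name="frozen_data")
    # the first run creates the schema
    pipeline.run([{"id": 1, "name": "one"}], table_name="items")
    # a second run with a new column now raises PipelineStepFailed,
    # with SchemaFrozenException as its context
    pipeline.run([{"id": 2, "name": "two", "new_attribute": "oops"}], table_name="items")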