schema contract #594
Changes from 6 commits
@@ -0,0 +1,5 @@
+from dlt.common.exceptions import DltException
+
+class NormalizeException(DltException):
+    def __init__(self, msg: str) -> None:
+        super().__init__(msg)
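For context, a concrete normalize-stage error would subclass this new base like so (a hypothetical illustration, not part of the diff):

    class NormalizeJobFailed(NormalizeException):
        def __init__(self, load_id: str, msg: str) -> None:
            self.load_id = load_id
            super().__init__(f"normalize failed for load {load_id}: {msg}")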
@@ -67,14 +67,14 @@ def load_or_create_schema(schema_storage: SchemaStorage, schema_name: str) -> Sc

     @staticmethod
     def w_normalize_files(
+        normalize_config: NormalizeConfiguration,
         normalize_storage_config: NormalizeStorageConfiguration,
         loader_storage_config: LoadStorageConfiguration,
         destination_caps: DestinationCapabilitiesContext,
         stored_schema: TStoredSchema,
         load_id: str,
         extracted_items_files: Sequence[str],
     ) -> TWorkerRV:
         schema_updates: List[TSchemaUpdate] = []
         total_items = 0
         row_counts: TRowCount = {}
@@ -98,7 +98,7 @@ def w_normalize_files(
                 items_count = 0
                 for line_no, line in enumerate(f):
                     items: List[TDataItem] = json.loads(line)
-                    partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items)
+                    partial_update, items_count, r_counts = Normalize._w_normalize_chunk(normalize_config, load_storage, schema, load_id, root_table_name, items)
                     schema_updates.append(partial_update)
                     total_items += items_count
                     merge_row_count(row_counts, r_counts)
@@ -127,7 +127,7 @@ def w_normalize_files(
         return schema_updates, total_items, load_storage.closed_files(), row_counts

     @staticmethod
-    def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]:
+    def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]:
         column_schemas: Dict[str, TTableSchemaColumns] = {}  # quick access to column schema for writers below
         schema_update: TSchemaUpdate = {}
         schema_name = schema.name
@@ -139,31 +139,38 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str,
                 # filter row, may eliminate some or all fields
                 row = schema.filter_row(table_name, row)
                 # do not process empty rows
-                if row:
-                    # decode pua types
-                    for k, v in row.items():
-                        row[k] = custom_pua_decode(v)  # type: ignore
-                    # coerce row of values into schema table, generating partial table with new columns if any
-                    row, partial_table = schema.coerce_row(table_name, parent_table, row)
-                    # there's a new table or new columns in existing table
-                    if partial_table:
-                        # update schema and save the change
-                        schema.update_schema(partial_table)
-                        table_updates = schema_update.setdefault(table_name, [])
-                        table_updates.append(partial_table)
-                        # update our columns
-                        column_schemas[table_name] = schema.get_table_columns(table_name)
-                    # get current columns schema
-                    columns = column_schemas.get(table_name)
-                    if not columns:
-                        columns = schema.get_table_columns(table_name)
-                        column_schemas[table_name] = columns
-                    # store row
-                    # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock
-                    load_storage.write_data_item(load_id, schema_name, table_name, row, columns)
-                    # count total items
-                    items_count += 1
-                    increase_row_count(row_counts, table_name, 1)
+                if not row:
+                    continue
+                # decode pua types
+                for k, v in row.items():
+                    row[k] = custom_pua_decode(v)  # type: ignore
+                # coerce row of values into schema table, generating partial table with new columns if any
+                row, partial_table = schema.coerce_row(table_name, parent_table, row)
+                # check update
+                row, partial_table = schema.check_schema_update(table_name, row, partial_table, config.schema_update_mode)
Review comment: don't call it if `partial_table` is None! This is 99.99% of cases and this code is time critical.
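A minimal sketch of the guard being requested (assuming `check_schema_update` keeps the signature shown above); the call is skipped on the hot path whenever `coerce_row` produced no schema change:

    # hypothetical guard, per the review: only consult the contract when the
    # row actually produced a schema change (partial_table is not None)
    if partial_table is not None:
        row, partial_table = schema.check_schema_update(
            table_name, row, partial_table, config.schema_update_mode
        )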
+                if not row:
+                    continue
+
+                # there's a new table or new columns in existing table
+                if partial_table:
+                    # update schema and save the change
+                    schema.update_schema(partial_table)
+                    table_updates = schema_update.setdefault(table_name, [])
+                    table_updates.append(partial_table)
+                    # update our columns
+                    column_schemas[table_name] = schema.get_table_columns(table_name)
+                # get current columns schema
+                columns = column_schemas.get(table_name)
+                if not columns:
+                    columns = schema.get_table_columns(table_name)
+                    column_schemas[table_name] = columns
+                # store row
+                # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock
+                load_storage.write_data_item(load_id, schema_name, table_name, row, columns)
+                # count total items
+                items_count += 1
+                increase_row_count(row_counts, table_name, 1)
             signals.raise_if_signalled()
         return schema_update, items_count, row_counts
@@ -196,7 +203,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM
         workers = self.pool._processes  # type: ignore
         chunk_files = self.group_worker_files(files, workers)
         schema_dict: TStoredSchema = schema.to_dict()
-        config_tuple = (self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict)
+        config_tuple = (self.config, self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict)
Review comment: just FYI: this goes through a process boundary and will be pickled. Pay attention.
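A quick local smoke test for that concern (a sketch, not part of the PR): every element of `config_tuple` must survive a pickle round-trip before multiprocessing ships it to the worker processes.

    import pickle

    # multiprocessing pickles worker arguments, so each element of the
    # tuple handed to the pool must round-trip cleanly
    for item in config_tuple:
        pickle.loads(pickle.dumps(item))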
         param_chunk = [[*config_tuple, load_id, files] for files in chunk_files]
         tasks: List[Tuple[AsyncResult[TWorkerRV], List[Any]]] = []
         row_counts: TRowCount = {}
@@ -249,6 +256,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM

     def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV:
         result = Normalize.w_normalize_files(
+            self.config,
             self.normalize_storage.config,
             self.load_storage.config,
             self.config.destination_capabilities,
@@ -0,0 +1,76 @@
+import dlt, os, pytest
+from dlt.common.utils import uniq_id
+
+from tests.load.pipeline.utils import load_table_counts
+from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration
+from dlt.pipeline.exceptions import PipelineStepFailed
+from dlt.common.schema.exceptions import SchemaFrozenException
+from dlt.common.schema import utils
+
+SCHEMA_UPDATE_MODES = ["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"]
+
+@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), ids=lambda x: x.name)
+@pytest.mark.parametrize("update_mode", SCHEMA_UPDATE_MODES)
+def test_freeze_schema(update_mode: str, destination_config: DestinationTestConfiguration) -> None:
+
+    # freeze pipeline, drop additional values
+    # this will allow the first run to create the schema, but will not accept further updates after that
+    os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = update_mode
+    pipeline = destination_config.setup_pipeline("test_freeze_schema_2", dataset_name="freeze" + uniq_id())
+
+    @dlt.resource(name="items", write_disposition="append")
+    def load_items():
+        for _, index in enumerate(range(0, 10), 1):
+            yield {
+                "id": index,
+                "name": f"item {index}"
+            }
+
+    @dlt.resource(name="items", write_disposition="append")
+    def load_items_with_subitems():
+        for _, index in enumerate(range(0, 10), 1):
+            yield {
+                "id": index,
+                "name": f"item {index}",
+                "new_attribute": "hello",
+                "sub_items": [{
+                    "id": index + 1000,
+                    "name": f"sub item {index + 1000}"
+                }, {
+                    "id": index + 2000,
+                    "name": f"sub item {index + 2000}"
+                }]
+            }
+
+    pipeline.run([load_items], loader_file_format=destination_config.file_format)
+    table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+    # check data
+    assert table_counts["items"] == 10
+    schema_hash = utils.generate_version_hash(pipeline.default_schema.to_dict())
+
+    # on freeze and raise we expect an exception
+    if update_mode == "freeze-and-raise":
+        with pytest.raises(PipelineStepFailed) as py_ex:
+            pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format)
+        assert isinstance(py_ex.value.__context__, SchemaFrozenException)
+    else:
+        pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format)
+
+    # check data
+    table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+    assert table_counts["items"] == (20 if update_mode not in ["freeze-and-raise", "freeze-and-discard"] else 10)
+
+    # frozen schemas should not have changed
+    if update_mode != "evolve":
+        assert schema_hash == utils.generate_version_hash(pipeline.default_schema.to_dict())
+        assert "items__sub_items" not in table_counts
+        # schema was not migrated to contain new attribute
+        assert "new_attribute" not in pipeline.default_schema.tables["items"]["columns"]
+    # regular mode evolves the schema
+    else:
+        assert table_counts["items__sub_items"] == 20
+        # schema was migrated to contain new attribute
+        assert "new_attribute" in pipeline.default_schema.tables["items"]["columns"]
Review comment: this is a solid implementation, but I'd like to describe it in the ticket (we can reuse it for the docs) and maybe modify the requirements.
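For that write-up, a minimal usage sketch (hedged: the env var name and mode values are taken from the test above, and the first-run behavior follows the test's comments):

    import os
    import dlt

    # one of: evolve, freeze-and-trim, freeze-and-raise, freeze-and-discard
    os.environ["NORMALIZE__SCHEMA_UPDATE_MODE"] = "freeze-and-raise"

    pipeline = dlt.pipeline(pipeline_name="frozen", destination="duckdb", dataset_name="frozen_data")
    # the first run creates the schema
    pipeline.run([{"id": 1, "name": "one"}], table_name="items")
    # a second run with a new column now raises PipelineStepFailed,
    # with SchemaFrozenException as its context
    pipeline.run([{"id": 2, "name": "two", "new_attribute": "oops"}], table_name="items")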