Extend custom destination #1107

Merged on Mar 20, 2024 (18 commits)
Changes from 10 commits
1 change: 1 addition & 0 deletions dlt/common/destination/capabilities.py
@@ -55,6 +55,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext):
insert_values_writer_type: str = "default"
supports_multiple_statements: bool = True
supports_clone_table: bool = False
"""Destination supports CREATE TABLE ... CLONE ... statements"""

max_table_nesting: Optional[int] = None  # destination can overwrite max table nesting

# do not allow to create default value, destination caps must be always explicitly inserted into container
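As an aside, a hedged sketch of how a destination can surface this new capability; generic_capabilities is the same helper the custom destination uses further down in this PR, and the value 0 is only an illustrative choice:

from dlt.common.destination import DestinationCapabilitiesContext

# illustrative only: start from generic capabilities and cap nesting entirely,
# so nested fields stay as JSON instead of being split into child tables
caps = DestinationCapabilitiesContext.generic_capabilities("puae-jsonl")
caps.max_table_nesting = 0
print(caps.max_table_nesting)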
21 changes: 21 additions & 0 deletions dlt/common/destination/reference.py
@@ -260,6 +260,27 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]:
return []


class DoNothingJob(LoadJob):
"""The most lazy class of dlt"""

def __init__(self, file_path: str) -> None:
super().__init__(FileStorage.get_file_name_from_file_path(file_path))

def state(self) -> TLoadJobState:
# this job is always done
return "completed"

def exception(self) -> str:
# this part of code should be never reached
raise NotImplementedError()


class DoNothingFollowupJob(DoNothingJob, FollowupJob):
"""The second most lazy class of dlt"""

pass


class JobClientBase(ABC):
capabilities: ClassVar[DestinationCapabilitiesContext] = None

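For illustration, a hedged sketch of the relocated class in action, assuming a load file name that follows dlt's table.file_id.retry.ext convention (the path below is made up): a job for an internal dlt table can be acknowledged as completed without writing anything.

from dlt.common.destination.reference import DoNothingJob

# hypothetical file path; the job reports "completed" immediately and no data is moved
job = DoNothingJob("/tmp/_dlt_pipeline_state.1234.0.jsonl")
assert job.state() == "completed"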
14 changes: 12 additions & 2 deletions dlt/common/normalizers/configuration.py
@@ -5,7 +5,7 @@
from dlt.common.configuration.specs import BaseConfiguration
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.normalizers.typing import TJSONNormalizer
from dlt.common.typing import StrAny
from dlt.common.typing import DictStrAny


@configspec
@@ -14,14 +14,24 @@ class NormalizersConfiguration(BaseConfiguration):
__section__: str = "schema"

naming: Optional[str] = None
json_normalizer: Optional[StrAny] = None
json_normalizer: Optional[DictStrAny] = None
destination_capabilities: Optional[DestinationCapabilitiesContext] = None # injectable

def on_resolved(self) -> None:
# get naming from capabilities if not present
if self.naming is None:
if self.destination_capabilities:
self.naming = self.destination_capabilities.naming_convention
# if max_table_nesting is set, we need to set the max_table_nesting in the json_normalizer
if (
self.destination_capabilities
and self.destination_capabilities.max_table_nesting is not None
):
self.json_normalizer = self.json_normalizer or {}
self.json_normalizer.setdefault("config", {})
self.json_normalizer["config"][
"max_nesting"
] = self.destination_capabilities.max_table_nesting

Collaborator: this is the best we can do now. If we have more normalizers with incompatible configs, then we'll need to look for something better.

if TYPE_CHECKING:

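To make the propagation concrete, a small hedged sketch using plain dicts (no dlt imports) of what on_resolved does to the json_normalizer entry when the injected capabilities set max_table_nesting:

# pre-existing user config is kept; the capability value only adds "max_nesting"
json_normalizer = {"config": {"propagation": {}}}
max_table_nesting = 0  # taken from destination capabilities in the real code

json_normalizer = json_normalizer or {}
json_normalizer.setdefault("config", {})
json_normalizer["config"]["max_nesting"] = max_table_nesting

assert json_normalizer == {"config": {"propagation": {}, "max_nesting": 0}}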
8 changes: 5 additions & 3 deletions dlt/common/normalizers/utils.py
@@ -34,9 +34,11 @@ def import_normalizers(
"""
# add defaults to normalizer_config
normalizers_config["names"] = names = normalizers_config["names"] or "snake_case"
normalizers_config["json"] = item_normalizer = normalizers_config["json"] or {
"module": "dlt.common.normalizers.json.relational"
}
# set default json normalizer module
normalizers_config["json"] = item_normalizer = normalizers_config.get("json") or {}
if "module" not in item_normalizer:
item_normalizer["module"] = "dlt.common.normalizers.json.relational"

try:
if "." in names:
# TODO: bump schema engine version and migrate schema. also change the name in TNormalizersConfig from names to naming
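The point of the change above is that a partially specified "json" entry is no longer replaced wholesale; a stand-alone sketch of the new defaulting behaviour, with a made-up config:

# config that carries only a normalizer config, no explicit module
normalizers_config = {"json": {"config": {"max_nesting": 1}}}

item_normalizer = normalizers_config.get("json") or {}
if "module" not in item_normalizer:
    item_normalizer["module"] = "dlt.common.normalizers.json.relational"
normalizers_config["json"] = item_normalizer

# the user config survives and only the missing module is defaulted
assert normalizers_config["json"] == {
    "config": {"max_nesting": 1},
    "module": "dlt.common.normalizers.json.relational",
}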
4 changes: 4 additions & 0 deletions dlt/destinations/decorators.py
@@ -23,6 +23,8 @@ def destination(
batch_size: int = 10,
name: str = None,
naming_convention: str = "direct",
Collaborator: good! Please add this to our docs: with the default settings, data arrives at the sink without changed identifiers, un-nested and with the dlt identifiers removed, which makes it a good fit for pushing data to queues and REST APIs.

skip_dlt_columns_and_tables: bool = True,
max_table_nesting: int = 0,
spec: Type[GenericDestinationClientConfiguration] = GenericDestinationClientConfiguration,
) -> Callable[
[Callable[Concatenate[Union[TDataItems, str], TTableSchema, TDestinationCallableParams], Any]],
@@ -49,6 +51,8 @@ def wrapper(
batch_size=batch_size,
destination_name=name,
naming_convention=naming_convention,
skip_dlt_columns_and_tables=skip_dlt_columns_and_tables,
max_table_nesting=max_table_nesting,
**kwargs, # type: ignore
)

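Picking up the reviewer's docs request, a hedged sketch of what these defaults buy a custom sink: identifiers pass through unchanged (naming_convention="direct"), data arrives un-nested (max_table_nesting=0) and dlt columns are stripped (skip_dlt_columns_and_tables=True), which suits pushing rows to a queue or REST API. The endpoint URL and the requests dependency are assumptions, not part of this PR:

import dlt
import requests  # assumption: any HTTP client would do

@dlt.destination(batch_size=25)  # defaults: direct naming, no nesting, no _dlt columns
def rest_api_sink(items, table) -> None:
    # items arrive as a list of plain dicts with their original field names
    requests.post(
        "https://example.com/ingest",  # hypothetical endpoint
        json={"table": table["name"], "rows": items},
        timeout=30,
    )

# usage sketch: any resource can be piped straight into the sink, e.g.
# dlt.pipeline("push_to_api", destination=rest_api_sink).run([{"id": 1}], table_name="events")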
23 changes: 1 addition & 22 deletions dlt/destinations/impl/athena/athena.py
@@ -37,7 +37,7 @@
from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition, TTableFormat
from dlt.common.schema.utils import table_schema_has_type, get_table_format
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.reference import LoadJob, FollowupJob
from dlt.common.destination.reference import LoadJob, DoNothingFollowupJob, DoNothingJob
from dlt.common.destination.reference import TLoadJobState, NewLoadJob, SupportsStagingDestination
from dlt.common.storages import FileStorage
from dlt.common.data_writers.escape import escape_bigquery_identifier
@@ -149,27 +149,6 @@ def __init__(self) -> None:
DLTAthenaFormatter._INSTANCE = self


class DoNothingJob(LoadJob):
"""The most lazy class of dlt"""

def __init__(self, file_path: str) -> None:
super().__init__(FileStorage.get_file_name_from_file_path(file_path))

def state(self) -> TLoadJobState:
# this job is always done
return "completed"

def exception(self) -> str:
# this part of code should be never reached
raise NotImplementedError()


class DoNothingFollowupJob(DoNothingJob, FollowupJob):
"""The second most lazy class of dlt"""

pass


class AthenaSQLClient(SqlClientBase[Connection]):
capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
dbapi: ClassVar[DBApi] = pyathena
3 changes: 3 additions & 0 deletions dlt/destinations/impl/destination/__init__.py
@@ -1,14 +1,17 @@
from typing import Optional
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.data_writers import TLoaderFileFormat


def capabilities(
preferred_loader_file_format: TLoaderFileFormat = "puae-jsonl",
naming_convention: str = "direct",
max_table_nesting: Optional[int] = 0,
) -> DestinationCapabilitiesContext:
caps = DestinationCapabilitiesContext.generic_capabilities(preferred_loader_file_format)
caps.supported_loader_file_formats = ["puae-jsonl", "parquet"]
caps.supports_ddl_transactions = False
caps.supports_transactions = False
caps.naming_convention = naming_convention
caps.max_table_nesting = max_table_nesting
return caps
2 changes: 2 additions & 0 deletions dlt/destinations/impl/destination/configuration.py
@@ -20,6 +20,8 @@ class GenericDestinationClientConfiguration(DestinationClientConfiguration):
destination_callable: Optional[Union[str, TDestinationCallable]] = None # noqa: A003
loader_file_format: TLoaderFileFormat = "puae-jsonl"
batch_size: int = 10
skip_dlt_columns_and_tables: bool = True
max_table_nesting: int = 0

if TYPE_CHECKING:

30 changes: 28 additions & 2 deletions dlt/destinations/impl/destination/destination.py
@@ -1,7 +1,9 @@
from abc import ABC, abstractmethod
from types import TracebackType
from typing import ClassVar, Dict, Optional, Type, Iterable, Iterable, cast, Dict
from typing import ClassVar, Dict, Optional, Type, Iterable, Iterable, cast, Dict, List
from copy import deepcopy

from dlt.common.destination.reference import LoadJob
from dlt.destinations.job_impl import EmptyLoadJob
from dlt.common.typing import TDataItems, AnyFun
from dlt.common import json
@@ -18,6 +20,7 @@
from dlt.common.destination.reference import (
TLoadJobState,
LoadJob,
DoNothingJob,
JobClientBase,
)

@@ -37,6 +40,7 @@ def __init__(
schema: Schema,
destination_state: Dict[str, int],
destination_callable: TDestinationCallable,
skipped_columns: List[str],
) -> None:
super().__init__(FileStorage.get_file_name_from_file_path(file_path))
self._file_path = file_path
@@ -47,6 +51,7 @@ def __init__(
self._callable = destination_callable
self._state: TLoadJobState = "running"
self._storage_id = f"{self._parsed_file_name.table_name}.{self._parsed_file_name.file_id}"
self.skipped_columns = skipped_columns
try:
if self._config.batch_size == 0:
# on batch size zero we only call the callable with the filename
@@ -93,9 +98,14 @@ def run(self, start_index: int) -> Iterable[TDataItems]:
start_index % self._config.batch_size
) == 0, "Batch size was changed during processing of one load package"

# on record batches we cannot drop columns, we need to
# select the ones we want to keep
keep_columns = list(self._table["columns"].keys())
start_batch = start_index / self._config.batch_size
with pyarrow.parquet.ParquetFile(self._file_path) as reader:
for record_batch in reader.iter_batches(batch_size=self._config.batch_size):
for record_batch in reader.iter_batches(
batch_size=self._config.batch_size, columns=keep_columns
):
if start_batch > 0:
start_batch -= 1
continue
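Since columns cannot be dropped from a record batch after the fact, the parquet job selects them at read time; a minimal stand-alone pyarrow sketch of that pattern (file name and column list are made up):

import pyarrow.parquet

# only the columns still present in the table schema are read,
# so skipped _dlt_* columns never reach the sink callable
keep_columns = ["id", "name"]
with pyarrow.parquet.ParquetFile("items.parquet") as reader:
    for record_batch in reader.iter_batches(batch_size=10, columns=keep_columns):
        print(record_batch.to_pylist())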
@@ -115,6 +125,9 @@ def run(self, start_index: int) -> Iterable[TDataItems]:
if start_index > 0:
start_index -= 1
continue
# skip internal columns
for column in self.skipped_columns:
item.pop(column, None)
current_batch.append(item)
if len(current_batch) == self._config.batch_size:
yield current_batch
@@ -150,6 +163,17 @@ def update_stored_schema(
return super().update_stored_schema(only_tables, expected_update)

def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
# skip internal tables and remove columns from schema if so configured
skipped_columns: List[str] = []
if self.config.skip_dlt_columns_and_tables:
if table["name"].startswith(self.schema._dlt_tables_prefix):
return DoNothingJob(file_path)
table = deepcopy(table)
for column in list(table["columns"].keys()):
if column.startswith(self.schema._dlt_tables_prefix):
table["columns"].pop(column)
skipped_columns.append(column)

# save our state in destination name scope
load_state = destination_state()
if file_path.endswith("parquet"):
@@ -160,6 +184,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) ->
self.schema,
load_state,
self.destination_callable,
skipped_columns,
)
if file_path.endswith("jsonl"):
return DestinationJsonlLoadJob(
@@ -169,6 +194,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) ->
self.schema,
load_state,
self.destination_callable,
skipped_columns,
)
return None

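For the jsonl path the same filtering happens per item; a hedged sketch with made-up column names showing what the callable receives when skip_dlt_columns_and_tables stays at its default:

# hypothetical row as read from a jsonl load file
item = {"id": 1, "name": "alice", "_dlt_id": "GbDGrkwGqaUrRQ", "_dlt_load_id": "1709999999.1"}

# in the real code the skipped names come from the table schema; here we just match the prefix
skipped_columns = [name for name in item if name.startswith("_dlt")]
for column in skipped_columns:
    item.pop(column, None)

assert item == {"id": 1, "name": "alice"}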
5 changes: 3 additions & 2 deletions dlt/destinations/impl/destination/factory.py
@@ -36,8 +36,9 @@ class DestinationInfo(t.NamedTuple):
class destination(Destination[GenericDestinationClientConfiguration, "DestinationClient"]):
def capabilities(self) -> DestinationCapabilitiesContext:
return capabilities(
self.config_params.get("loader_file_format", "puae-jsonl"),
self.config_params.get("naming_convention", "direct"),
preferred_loader_file_format=self.config_params.get("loader_file_format", "puae-jsonl"),
naming_convention=self.config_params.get("naming_convention", "direct"),
max_table_nesting=self.config_params.get("max_table_nesting", None),
)

@property
1 change: 1 addition & 0 deletions dlt/pipeline/pipeline.py
@@ -457,6 +457,7 @@ def normalize(

# make sure destination capabilities are available
self._get_destination_capabilities()

# create default normalize config
normalize_config = NormalizeConfiguration(
workers=workers,
@@ -0,0 +1,9 @@
# you can just paste the contents of services.json as credentials
[destination.bigquery.credentials]
client_email = ""
private_key = ""
project_id = ""
token_uri = ""
refresh_token = ""
client_id = ""
client_secret = ""
@@ -0,0 +1,71 @@
import dlt
import pandas as pd
import pyarrow as pa
from google.cloud import bigquery

from dlt.common.configuration.specs import GcpServiceAccountCredentials

# constants
OWID_DISASTERS_URL = (
"https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/"
"Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/"
"Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv"
)
# this table needs to be manually created in your gc account
# format: "your-project.your_dataset.your_table"
BIGQUERY_TABLE_ID = "chat-analytics-rasa-ci.ci_streaming_insert.natural-disasters"

# dlt sources
@dlt.resource(name="natural_disasters")
def resource(url: str):
# load pyarrow table with pandas
table = pa.Table.from_pandas(pd.read_csv(url))
# we add a list type column to demonstrate bigquery lists
table = table.append_column(
"tags",
pa.array(
[["disasters", "earthquakes", "floods", "tsunamis"]] * len(table),
pa.list_(pa.string()),
),
)
# we add a struct type column to demonstrate bigquery structs
table = table.append_column(
"meta",
pa.array(
[{"loaded_by": "dlt"}] * len(table),
pa.struct([("loaded_by", pa.string())]),
),
)
yield table

# dlt bigquery custom destination
# we can use the dlt provided credentials class
# to retrieve the gcp credentials from the secrets
@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0)
def bigquery_insert(
items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value
) -> None:
client = bigquery.Client(
credentials.project_id, credentials.to_native_credentials(), location="US"
)
job_config = bigquery.LoadJobConfig(
autodetect=True,
source_format=bigquery.SourceFormat.PARQUET,
schema_update_options=bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
)
# since we have set the batch_size to 0, we get a filepath and can load the file directly
with open(items, "rb") as f:
load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config)
load_job.result() # Waits for the job to complete.

if __name__ == "__main__":
# run the pipeline and print load results
pipeline = dlt.pipeline(
pipeline_name="csv_to_bigquery_insert",
destination=bigquery_insert,
dataset_name="mydata",
full_refresh=True,
)
load_info = pipeline.run(resource(url=OWID_DISASTERS_URL))

print(load_info)