dlt-hub · rudolfix · Mar 5, 2024 · Mar 5, 2024 · Mar 5, 2024 · Mar 5, 2024
diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py
@@ -7,7 +7,6 @@
 from functools import wraps
 
 
-
 import dlt
 from dlt.common.exceptions import MissingDependencyException
 from dlt.common import pendulum, logger

diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py
@@ -313,8 +313,17 @@ def add_limit(self, max_items: int) -> "DltResource":  # noqa: A003
             "DltResource": returns self
         """
 
+        # make sure max_items is a number, to allow "None" as value for unlimited
+        if max_items is None:
+            max_items = -1
+
         def _gen_wrap(gen: TPipeStep) -> TPipeStep:
             """Wrap a generator to take the first `max_items` records"""
+
+            # zero items should produce empty generator
+            if max_items == 0:
+                return
+
             count = 0
             is_async_gen = False
             if inspect.isfunction(gen):

diff --git a/docs/examples/connector_x_arrow/load_arrow.py b/docs/examples/connector_x_arrow/load_arrow.py
@@ -3,6 +3,7 @@
 import dlt
 from dlt.sources.credentials import ConnectionStringCredentials
 
+
 def read_sql_x(
     conn_str: ConnectionStringCredentials = dlt.secrets.value,
     query: str = dlt.config.value,
@@ -14,6 +15,7 @@ def read_sql_x(
         protocol="binary",
     )
 
+
 def genome_resource():
     # create genome resource with merge on `upid` primary key
     genome = dlt.resource(

diff --git a/docs/examples/google_sheets/google_sheets.py b/docs/examples/google_sheets/google_sheets.py
@@ -9,13 +9,15 @@
 )
 from dlt.common.typing import DictStrAny, StrAny
 
+
 def _initialize_sheets(
     credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials]
 ) -> Any:
     # Build the service object.
     service = build("sheets", "v4", credentials=credentials.to_native_credentials())
     return service
 
+
 @dlt.source
 def google_spreadsheet(
     spreadsheet_id: str,
@@ -55,6 +57,7 @@ def get_sheet(sheet_name: str) -> Iterator[DictStrAny]:
         for name in sheet_names
     ]
 
+
 if __name__ == "__main__":
     pipeline = dlt.pipeline(destination="duckdb")
     # see example.secrets.toml to where to put credentials
@@ -67,4 +70,4 @@ def get_sheet(sheet_name: str) -> Iterator[DictStrAny]:
             sheet_names=range_names,
         )
     )
-    print(info)
+    print(info)
diff --git a/docs/examples/incremental_loading/zendesk.py b/docs/examples/incremental_loading/zendesk.py
@@ -6,12 +6,11 @@
 from dlt.common.typing import TAnyDateTime
 from dlt.sources.helpers.requests import client
 
+
 @dlt.source(max_table_nesting=2)
 def zendesk_support(
     credentials: Dict[str, str] = dlt.secrets.value,
-    start_date: Optional[TAnyDateTime] = pendulum.datetime(  # noqa: B008
-        year=2000, month=1, day=1
-    ),
+    start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1),  # noqa: B008
     end_date: Optional[TAnyDateTime] = None,
 ):
     """
@@ -113,11 +112,12 @@ def get_pages(
         if not response_json["end_of_stream"]:
             get_url = response_json["next_page"]
 
+
 if __name__ == "__main__":
     # create dlt pipeline
     pipeline = dlt.pipeline(
         pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data"
     )
 
     load_info = pipeline.run(zendesk_support())
-    print(load_info)
+    print(load_info)
diff --git a/docs/examples/nested_data/nested_data.py b/docs/examples/nested_data/nested_data.py
@@ -13,6 +13,7 @@
 
 CHUNK_SIZE = 10000
 
+
 # You can limit how deep dlt goes when generating child tables.
 # By default, the library will descend and generate child tables
 # for all nested lists, without a limit.
@@ -81,6 +82,7 @@ def load_documents(self) -> Iterator[TDataItem]:
         while docs_slice := list(islice(cursor, CHUNK_SIZE)):
             yield map_nested_in_place(convert_mongo_objs, docs_slice)
 
+
 def convert_mongo_objs(value: Any) -> Any:
     if isinstance(value, (ObjectId, Decimal128)):
         return str(value)

diff --git a/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py
@@ -4,6 +4,7 @@
 from dlt.destinations.impl.weaviate import weaviate_adapter
 from PyPDF2 import PdfReader
 
+
 @dlt.resource(selected=False)
 def list_files(folder_path: str):
     folder_path = os.path.abspath(folder_path)
@@ -15,6 +16,7 @@ def list_files(folder_path: str):
             "mtime": os.path.getmtime(file_path),
         }
 
+
 @dlt.transformer(primary_key="page_id", write_disposition="merge")
 def pdf_to_text(file_item, separate_pages: bool = False):
     if not separate_pages:
@@ -28,6 +30,7 @@ def pdf_to_text(file_item, separate_pages: bool = False):
         page_item["page_id"] = file_item["file_name"] + "_" + str(page_no)
         yield page_item
 
+
 pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate")
 
 # this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf"
@@ -51,4 +54,4 @@ def pdf_to_text(file_item, separate_pages: bool = False):
 
 client = weaviate.Client("http://localhost:8080")
 # get text of all the invoices in InvoiceText class we just created above
-print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
+print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
diff --git a/docs/examples/qdrant_zendesk/qdrant.py b/docs/examples/qdrant_zendesk/qdrant.py
@@ -10,13 +10,12 @@
 
 from dlt.common.configuration.inject import with_config
 
+
 # function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk
 @dlt.source(max_table_nesting=2)
 def zendesk_support(
     credentials: Dict[str, str] = dlt.secrets.value,
-    start_date: Optional[TAnyDateTime] = pendulum.datetime(  # noqa: B008
-        year=2000, month=1, day=1
-    ),
+    start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1),  # noqa: B008
     end_date: Optional[TAnyDateTime] = None,
 ):
     """
@@ -80,13 +79,15 @@ def _parse_date_or_none(value: Optional[str]) -> Optional[pendulum.DateTime]:
         return None
     return ensure_pendulum_datetime(value)
 
+
 # modify dates to return datetime objects instead
 def _fix_date(ticket):
     ticket["updated_at"] = _parse_date_or_none(ticket["updated_at"])
     ticket["created_at"] = _parse_date_or_none(ticket["created_at"])
     ticket["due_at"] = _parse_date_or_none(ticket["due_at"])
     return ticket
 
+
 # function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk
 def get_pages(
     url: str,
@@ -127,6 +128,7 @@ def get_pages(
         if not response_json["end_of_stream"]:
             get_url = response_json["next_page"]
 
+
 if __name__ == "__main__":
     # create a pipeline with an appropriate name
     pipeline = dlt.pipeline(
@@ -146,7 +148,6 @@ def get_pages(
 
     print(load_info)
 
-
     # running the Qdrant client to connect to your Qdrant database
 
     @with_config(sections=("destination", "qdrant", "credentials"))

diff --git a/docs/examples/transformers/pokemon.py b/docs/examples/transformers/pokemon.py
@@ -1,6 +1,7 @@
 import dlt
 from dlt.sources.helpers import requests
 
+
 @dlt.source(max_table_nesting=2)
 def source(pokemon_api_url: str):
     """"""
@@ -46,6 +47,7 @@ def species(pokemon_details):
 
     return (pokemon_list | pokemon, pokemon_list | pokemon | species)
 
+
 if __name__ == "__main__":
     # build duck db pipeline
     pipeline = dlt.pipeline(
@@ -54,4 +56,4 @@ def species(pokemon_details):
 
     # the pokemon_list resource does not need to be loaded
     load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon"))
-    print(load_info)
+    print(load_info)
diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md
@@ -362,6 +362,9 @@ assert list(r) == list(range(10))
 > 💡 You cannot limit transformers. They should process all the data they receive fully to avoid
 > inconsistencies in generated datasets.
 
+> 💡 If you are parametrizing the value of `add_limit` and sometimes need it to be disabled, you can set `None` or `-1`
+> to disable the limiting. You can also set the limit to `0` for the resource to not yield any items.
+
 ### Set table name and adjust schema
 
 You can change the schema of a resource, be it standalone or as a part of a source. Look for method

diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py
@@ -2,6 +2,7 @@
 from typing import Iterator
 
 import pytest
+import asyncio
 
 import dlt
 from dlt.common.configuration.container import Container
@@ -789,6 +790,31 @@ def test_limit_infinite_counter() -> None:
     assert list(r) == list(range(10))
 
 
+@pytest.mark.parametrize("limit", (None, -1, 0, 10))
+def test_limit_edge_cases(limit: int) -> None:
+    r = dlt.resource(range(20), name="infinity").add_limit(limit)  # type: ignore
+
+    @dlt.resource()
+    async def r_async():
+        for i in range(20):
+            await asyncio.sleep(0.01)
+            yield i
+
+    sync_list = list(r)
+    async_list = list(r_async().add_limit(limit))
+
+    # check the expected results
+    assert sync_list == async_list
+    if limit == 10:
+        assert sync_list == list(range(10))
+    elif limit in [None, -1]:
+        assert sync_list == list(range(20))
+    elif limit == 0:
+        assert sync_list == []
+    else:
+        raise AssertionError(f"Unexpected limit: {limit}")
+
+
 def test_limit_source() -> None:
     def mul_c(item):
         yield from "A" * (item + 2)