In [3]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator

In [5]:
# Incremental, automatically paginated extraction of the "rides" resource
@dlt.resource(name="rides", write_disposition="append")
def ny_taxi_inc_paginated_getter(
    cursor_date=dlt.sources.incremental(
        "Trip_Dropoff_DateTime",  # record field used as the incremental cursor
        initial_value="2009-06-15",  # cursor starting value: June 15, 2009
    )
):
    """Yield every page returned by the ``data_engineering_zoomcamp_api`` endpoint.

    Pages are requested through a ``RESTClient`` configured with a
    ``PageNumberPaginator`` whose first page is numbered 1 and which has no
    total-page field configured (``total_path=None``).

    Args:
        cursor_date: ``dlt.sources.incremental`` declaration on
            ``Trip_Dropoff_DateTime`` starting at ``"2009-06-15"``. The body
            never reads it directly; it is declared in the signature for dlt.
            NOTE(review): presumably dlt uses it to filter the yielded
            records and to persist the cursor between runs - confirm
            against the dlt incremental-loading docs.

    Yields:
        Each page produced by ``RESTClient.paginate``, unchanged.
    """
    page_paginator = PageNumberPaginator(base_page=1, total_path=None)
    api_client = RESTClient(
        base_url="https://us-central1-dlthub-analytics.cloudfunctions.net",
        paginator=page_paginator,
    )

    # Hand each page straight to dlt; no per-page processing happens here.
    yield from api_client.paginate("data_engineering_zoomcamp_api")

In [10]:
# Configure the dlt pipeline: DuckDB destination, dataset "ny_taxi_data"
pipeline = dlt.pipeline(
    pipeline_name="load_ny_taxi_data_inc",
    destination="duckdb",
    dataset_name="ny_taxi_data",
)

# Run the incremental rides resource through the pipeline
load_info = pipeline.run(ny_taxi_inc_paginated_getter)

# Report the trace of the most recent run (banner line, then the trace itself)
print("\nPrinting pipeline last trace...", pipeline.last_trace, sep="\n")


Printing pipeline last trace...
Run started at 2025-02-17 17:32:27.546816+00:00 and COMPLETED in 20.29 seconds with 4 steps.
Step extract COMPLETED in 20.17 seconds.

Load package 1739813547.647686 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.04 seconds.
No data found to normalize

Step load COMPLETED in 0.01 seconds.
Pipeline load_ny_taxi_data_inc load step completed in ---
0 load package(s) were loaded to destination duckdb and into dataset None
The duckdb destination used duckdb:////Users/elijahsutton/Projects/de-zoomcamp-2025/workshop_dlt/pipelines/load_ny_taxi_data_inc.duckdb location to store data

Step run COMPLETED in 20.29 seconds.
Pipeline load_ny_taxi_data_inc load step completed in ---
0 load package(s) were loaded to destination duckdb and into dataset None
The duckdb destination used duckdb:////Users/elijahsutton/Projects/de-zoomcamp-2025/workshop_dlt/pipelines/load_ny_taxi_data_inc.duckdb location to store

In [12]:
# Sanity check: earliest drop-off timestamp present in the loaded `rides` table.
# The query text is kept verbatim (including its indentation) so the SQL sent
# to the destination is unchanged.
earliest_dropoff_query = """
            SELECT
            MIN(trip_dropoff_date_time)
            FROM rides;
            """

with pipeline.sql_client() as client:
    res = client.execute_sql(earliest_dropoff_query)
    print(res)

[(datetime.datetime(2009, 6, 15, 0, 6, tzinfo=<UTC>),)]
