In [1]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator


@dlt.resource(name="rides", write_disposition="append")
def ny_taxi(
    cursor_date=dlt.sources.incremental(
        "Trip_Dropoff_DateTime",     # record field used as the incremental cursor
        initial_value="2009-06-15",  # cursor starting point: June 15, 2009
    ),
):
    """Yield pages of NY taxi ride records fetched from the Zoomcamp REST API.

    Registered as the dlt resource ``rides`` with the ``append`` write
    disposition.

    Args:
        cursor_date: dlt incremental definition on ``Trip_Dropoff_DateTime``
            with an initial value of ``"2009-06-15"``. It is never referenced
            in the body; it is only declared in the signature so that dlt can
            pick it up.

    Yields:
        Each page produced by ``RESTClient.paginate`` for the
        ``data_engineering_zoomcamp_api`` endpoint, one page at a time.
    """
    # Page numbers start at 1. total_path=None means no "total pages" field is
    # read from the response -- presumably pagination then ends on the first
    # empty page; confirm against the dlt paginator docs.
    page_paginator = PageNumberPaginator(base_page=1, total_path=None)

    api_client = RESTClient(
        base_url="https://us-central1-dlthub-analytics.cloudfunctions.net",
        paginator=page_paginator,
    )

    yield from api_client.paginate("data_engineering_zoomcamp_api")



In [2]:
# Build a dlt pipeline named "ny_taxi" that writes to DuckDB,
# using "ny_taxi_data" as the dataset name.
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi",
    destination="duckdb",
    dataset_name="ny_taxi_data",
)

# Run the pipeline with the ny_taxi resource, then print the trace of that
# run (extract / normalize / load step summary).
load_info = pipeline.run(ny_taxi)
print(pipeline.last_trace)

Run started at 2025-02-11 00:41:39.236404+00:00 and COMPLETED in 23.13 seconds with 4 steps.
Step extract COMPLETED in 22.26 seconds.

Load package 1739234499.270241 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.36 seconds.
Normalized data for the following tables:
- rides: 5325 row(s)
- _dlt_pipeline_state: 1 row(s)

Load package 1739234499.270241 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs

Step load COMPLETED in 0.48 seconds.
Pipeline ny_taxi load step completed in 0.40 seconds
1 load package(s) were loaded to destination duckdb and into dataset ny_taxi_data
The duckdb destination used duckdb:////Users/ajain17/Documents/Developer/de-zoomcamp-homework-2025/ny_taxi.duckdb location to store data
Load package 1739234499.270241 is LOADED and contains no failed jobs

Step run COMPLETED in 23.13 seconds.
Pipeline ny_taxi load step completed in 0.40 seconds
1 load package(s) were loaded to de

In [3]:
# Look up the earliest drop-off timestamp stored in the `rides` table.
with pipeline.sql_client() as client:
    # The query text is kept exactly as originally written (whitespace included).
    min_dropoff_query = """
            SELECT
            MIN(trip_dropoff_date_time)
            FROM rides;
            """
    res = client.execute_sql(min_dropoff_query)
    print(res)

[(datetime.datetime(2009, 6, 15, 0, 6, tzinfo=<UTC>),)]


In [4]:
# Run the same pipeline a second time. Every record was already loaded by the
# previous run, so the incremental cursor should leave nothing new to load
# (expect "No data found to normalize" in the trace).
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi",
    destination="duckdb",
    dataset_name="ny_taxi_data",
)

# Re-run with the same resource and print the trace of this run.
load_info = pipeline.run(ny_taxi)
print(pipeline.last_trace)

Run started at 2025-02-11 00:43:35.182007+00:00 and COMPLETED in 22.79 seconds with 4 steps.
Step extract COMPLETED in 22.71 seconds.

Load package 1739234615.244511 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.03 seconds.
No data found to normalize

Step load COMPLETED in 0.01 seconds.
Pipeline ny_taxi load step completed in ---
0 load package(s) were loaded to destination duckdb and into dataset None
The duckdb destination used duckdb:////Users/ajain17/Documents/Developer/de-zoomcamp-homework-2025/ny_taxi.duckdb location to store data

Step run COMPLETED in 22.79 seconds.
Pipeline ny_taxi load step completed in ---
0 load package(s) were loaded to destination duckdb and into dataset None
The duckdb destination used duckdb:////Users/ajain17/Documents/Developer/de-zoomcamp-homework-2025/ny_taxi.duckdb location to store data
