### Install necessary packages
Install dlt with duckdb:

pip install dlt[duckdb]

Install streamlit to visualize the duckdb data:

pip install streamlit

# Run a simple pipeline using dlt

In [14]:
# load packages
import dlt
import pandas as pd

In [15]:
# load data
data = [
    {
        "vendor_name": "VTS",
		"record_hash": "b00361a396177a9cb410ff61f20015ad",
        "time": {
            "pickup": "2009-06-14 23:23:00",
            "dropoff": "2009-06-14 23:48:00"
        },
        "Trip_Distance": 17.52,
        "coordinates": {
            "start": {
                "lon": -73.787442,
                "lat": 40.641525
            },
            "end": {
                "lon": -73.980072,
                "lat": 40.742963
            }
        },
        "Rate_Code": None,
        "store_and_forward": None,
        "Payment": {
            "type": "Credit",
            "amt": 20.5,
            "surcharge": 0,
            "mta_tax": None,
            "tip": 9,
            "tolls": 4.15,
			"status": "booked"
        },
        "Passenger_Count": 2,
        "passengers": [
            {"name": "John", "rating": 4.9},
            {"name": "Jack", "rating": 3.9}
        ],
        "Stops": [
            {"lon": -73.6, "lat": 40.6},
            {"lon": -73.5, "lat": 40.5}
        ]
    },
]

In [16]:
# create pipeline data type using dlt to send this data to duckdb (local relational db)
pipeline = dlt.pipeline(pipeline_name="taxi_data", destination="duckdb", dataset_name="taxi_rides")

# run the pipeline with default settings and capture the outcome in info
info = pipeline.run(data=data, table_name="taxi_trips", write_disposition="replace")

print(info)

Pipeline taxi_data load step completed in 0.47 seconds
1 load package(s) were loaded to destination duckdb and into dataset taxi_rides
The duckdb destination used duckdb:////home/jdelzio/data-engineering-zoomcamp/workshop_1/taxi_data.duckdb location to store data
Load package 1707965941.5676076 is LOADED and contains no failed jobs


# Icrementally update the data stored in duckdb

In [17]:
# load new data
new_data = [
    {
        "vendor_name": "VTS",
		"record_hash": "b00361a396177a9cb410ff61f20015ad",
        "time": {
            "pickup": "2009-06-14 23:23:00",
            "dropoff": "2009-06-14 23:48:00"
        },
        "Trip_Distance": 17.52,
        "coordinates": {
            "start": {
                "lon": -73.787442,
                "lat": 40.641525
            },
            "end": {
                "lon": -73.980072,
                "lat": 40.742963
            }
        },
        "Rate_Code": None,
        "store_and_forward": None,
        "Payment": {
            "type": "Credit",
            "amt": 20.5,
            "surcharge": 0,
            "mta_tax": None,
            "tip": 9,
            "tolls": 4.15,
			"status": "cancelled"
        },
        "Passenger_Count": 2,
        "passengers": [
            {"name": "John", "rating": 4.4},
            {"name": "Jack", "rating": 3.6}
        ],
        "Stops": [
            {"lon": -73.6, "lat": 40.6},
            {"lon": -73.5, "lat": 40.5}
        ]
    },
]

In [19]:
# create the pipeline with dlt
# We now use duckdb, but you can switch to Bigquery later
merge_pipeline = dlt.pipeline(pipeline_name='taxi_merge', destination='duckdb', dataset_name='taxi_rides')

# run the pipeline with default settings, and capture the outcome
info = merge_pipeline.run(data=new_data, 
					table_name="taxi_trips", 
					write_disposition="merge",
                    primary_key="record_hash")

# show the outcome
print(info)

Pipeline taxi_merge load step completed in 0.78 seconds
1 load package(s) were loaded to destination duckdb and into dataset taxi_rides
The duckdb destination used duckdb:////home/jdelzio/data-engineering-zoomcamp/workshop_1/taxi_merge.duckdb location to store data
Load package 1707966027.520352 is LOADED and contains no failed jobs
