In [1]:
import dlt

@dlt.resource(table_name="foo_data")
def foo():
    """Yield ten small sample records destined for the ``foo_data`` table.

    Each record is a dict with an integer ``id`` (0 through 9) and a
    human-readable ``name`` derived from that id.
    """
    yield from ({"id": i, "name": f"This is item {i}"} for i in range(10))

# Pipeline that writes into a DuckDB destination.
pipeline = dlt.pipeline(pipeline_name="python_data_example", destination="duckdb")

# Extract, normalize and load everything the `foo` resource yields.
load_info = pipeline.run(foo)

# Show the summary of the load that just finished.
print(load_info)


Pipeline python_data_example load step completed in 0.94 seconds
1 load package(s) were loaded to destination duckdb and into dataset python_data_example_dataset
The duckdb destination used duckdb:////home/endie/Projects/LLM-Projects/LLM-Zoomcamp/LLM-workSpace/intro/python_data_example.duckdb location to store data
Load package 1752127979.5875778 is LOADED and contains no failed jobs


In [2]:
# Read the loaded "foo_data" table back from the destination and print it
# as a dataframe.
foo_table = pipeline.dataset().foo_data
print(foo_table.df())

   id            name        _dlt_load_id         _dlt_id
0   0  This is item 0  1752127979.5875778  tRyyO+ZP7OHuoA
1   1  This is item 1  1752127979.5875778  3cZfNvXKd2fWuQ
2   2  This is item 2  1752127979.5875778  4rSTtkksavyDyQ
3   3  This is item 3  1752127979.5875778  RRrbsufoboZhoQ
4   4  This is item 4  1752127979.5875778  bbcmphvzB/1ZRw
5   5  This is item 5  1752127979.5875778  84Ix7cSO7hZi5g
6   6  This is item 6  1752127979.5875778  nnkPC1nfrMCN8g
7   7  This is item 7  1752127979.5875778  piXsAAZIXEGNag
8   8  This is item 8  1752127979.5875778  JvUu8WXIqrB8nA
9   9  This is item 9  1752127979.5875778  tVsQ2iwtZM8ROA


In [3]:
# Ask the dlt command-line tool which version is installed.
!dlt --version

dlt 1.13.0


In [4]:
# Install the Qdrant extras for dlt and the fastembed extras for qdrant-client.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, so the packages are importable from the cells below.
%pip install -q "dlt[qdrant]" "qdrant-client[fastembed]"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
import requests

@dlt.resource
def zoomcamp_data():
    """Yield every document from the remote ``documents.json``, tagged by course.

    The JSON file is a list of course entries; each entry has a ``course``
    name and a ``documents`` list. Every document is yielded individually
    with a ``course`` key set to the name of the entry it came from.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    # Bound the request so a stalled connection cannot hang the pipeline run.
    docs_response = requests.get(docs_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    for course in documents_raw:
        course_name = course['course']

        for doc in course['documents']:
            # Carry the parent course name on each document so it is kept
            # once the documents are flattened into a single table.
            doc['course'] = course_name
            yield doc

## Question 2. dlt pipeline
___
Now let's create a pipeline.

We need to define a destination for that. Let's use the qdrant one:



In [8]:
from dlt.destinations import qdrant

# Qdrant destination configured with the "db.qdrant" path.
qdrant_destination = qdrant(qd_path="db.qdrant")

**In this case, we tell dlt (and Qdrant) to create a folder for our data, named `db.qdrant`.
Let's run it:**


In [12]:
# Pipeline that loads into the Qdrant destination under the
# "zoomcamp_tagged_data" dataset.
pipeline = dlt.pipeline(
    pipeline_name="zoomcamp_pipeline",
    destination=qdrant_destination,
    dataset_name="zoomcamp_tagged_data",
)

# Load everything the zoomcamp_data resource yields.
load_info = pipeline.run(zoomcamp_data())

# Print the trace of the last run.
print(pipeline.last_trace)

Run started at 2025-07-10 06:32:37.610702+00:00 and COMPLETED in 32.20 seconds with 4 steps.
Step extract COMPLETED in 0.82 seconds.

Load package 1752129159.2188015 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.36 seconds.
Normalized data for the following tables:
- _dlt_pipeline_state: 1 row(s)
- zoomcamp_data: 948 row(s)

Load package 1752129159.2188015 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs

Step load COMPLETED in 29.46 seconds.
Pipeline zoomcamp_pipeline load step completed in 29.37 seconds
1 load package(s) were loaded to destination qdrant and into dataset zoomcamp_tagged_data
The qdrant destination used /home/endie/Projects/LLM-Projects/LLM-Zoomcamp/LLM-workSpace/intro/db.qdrant location to store data
Load package 1752129159.2188015 is LOADED and contains no failed jobs

Step run COMPLETED in 32.20 seconds.
Pipeline zoomcamp_pipeline load step completed in 29.37 seconds
1 lo

**{"collections": {"zoomcamp_tagged_data": {"vectors": {"fast-bge-small-en": {"size": 384, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null, "strict_mode_config": null}, "zoomcamp_tagged_data__dlt_pipeline_state": {"vectors": {"fast-bge-small-en": {"size": 384, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null, "strict_mode_config": null}, "zoomcamp_tagged_data__dlt_loads": {"vectors": {"fast-bge-small-en": {"size": 384, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null, "strict_mode_config": null}, "zoomcamp_tagged_data_zoomcamp_data": {"vectors": {"fast-bge-small-en": {"size": 384, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, 
"optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null, "strict_mode_config": null}, "zoomcamp_tagged_data__dlt_version": {"vectors": {"fast-bge-small-en": {"size": 384, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null, "strict_mode_config": null}}, "aliases": {}}**