In [1]:
!pip install -q "dlt[qdrant]" "qdrant-client[fastembed]"

In [2]:
# Display metadata for the installed dlt package (version, location, dependencies).
!pip show dlt

Name: dlt
Version: 1.13.0
Summary: dlt is an open-source python-first scalable data loading library that does not require any backend to run.
Home-page: https://github.com/dlt-hub
Author: 
Author-email: "dltHub Inc." <services@dlthub.com>
License-Expression: Apache-2.0
Location: /usr/local/python/3.12.1/lib/python3.12/site-packages
Requires: click, fsspec, gitpython, giturlparse, hexbytes, humanize, jsonpath-ng, orjson, packaging, pathvalidate, pendulum, pluggy, pytz, pyyaml, requests, requirements-parser, rich-argparse, semver, setuptools, simplejson, sqlglot, tenacity, tomlkit, typing-extensions, tzdata
Required-by: 


In [3]:
import dlt
import requests
from dlt.destinations import qdrant

In [4]:
@dlt.resource
def zoomcamp_data():
    """
    Reads FAQ data from a remote JSON file.

    Downloads the documents file, then walks every course entry in it and
    yields each of its documents with the course name copied onto the
    document under the ``course`` key.

    Yields:
        dict: one document, with ``doc['course']`` set to the ``course``
        value of the entry that contains it.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    # A timeout keeps the pipeline from hanging forever on a stalled connection.
    docs_response = requests.get(docs_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    for course in documents_raw:
        course_name = course['course']

        for doc in course['documents']:
            # Tag the document with its course before handing it to dlt.
            doc['course'] = course_name
            yield doc

In [5]:
# Qdrant destination that keeps its data under the relative "db.qdrant" path.
qdrant_destination = qdrant(qd_path="db.qdrant")

In [6]:
# Assemble the pipeline that writes into the `qdrant_destination` object
# under the "zoomcamp_tagged_data" dataset.
pipeline = dlt.pipeline(
    destination=qdrant_destination,
    dataset_name="zoomcamp_tagged_data",
    pipeline_name="zoomcamp_pipeline",
)

# Push the FAQ resource through the pipeline and keep the returned load summary.
load_info = pipeline.run(zoomcamp_data())

# Show the trace recorded for the run that just finished.
print(pipeline.last_trace)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/133M [00:00<?, ?B/s]

Run started at 2025-07-13 15:18:27.118864+00:00 and COMPLETED in 11.27 seconds with 4 steps.
Step extract COMPLETED in 0.42 seconds.

Load package 1752419913.2056346 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.09 seconds.
Normalized data for the following tables:
- _dlt_pipeline_state: 1 row(s)
- zoomcamp_data: 948 row(s)

Load package 1752419913.2056346 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs

Step load COMPLETED in 4.68 seconds.
Pipeline zoomcamp_pipeline load step completed in 4.66 seconds
1 load package(s) were loaded to destination qdrant and into dataset zoomcamp_tagged_data
The qdrant destination used /workspaces/llm-zoomcamp/workshop/db.qdrant location to store data
Load package 1752419913.2056346 is LOADED and contains no failed jobs

Step run COMPLETED in 11.27 seconds.
Pipeline zoomcamp_pipeline load step completed in 4.66 seconds
1 load package(s) were loaded to destina

In [7]:
# Print the metadata file found in the db.qdrant directory.
!cat db.qdrant/meta.json

{"collections": {"zoomcamp_tagged_data": {"vectors": {"fast-bge-small-en": {"size": 384, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null, "strict_mode_config": null}, "zoomcamp_tagged_data__dlt_version": {"vectors": {"fast-bge-small-en": {"size": 384, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null,

In [32]:
import json

# Read the metadata inside a context manager so the file handle is closed
# deterministically; JSON is UTF-8, so decode it as such on every platform.
with open('db.qdrant/meta.json', 'r', encoding='utf-8') as meta_file:
    qdrant_meta = json.load(meta_file)

# The key of the first entry under the collection's 'vectors' mapping is used
# as the embedding model name.
collection_vectors = qdrant_meta['collections']['zoomcamp_tagged_data_zoomcamp_data']['vectors']
embedding_model = list(collection_vectors.keys())[0]

embedding_model

'fast-bge-small-en'