# From REST to reasoning: ingest, index, and query with dlt and Cognee

In [1]:
# Print the installed dlt version so readers can see which release produced the outputs in this notebook.
import dlt
print(dlt.__version__)

1.12.3


In [6]:
import requests

# get data
@dlt.resource
def zoomcamp_data():
    """Yield every document from the workshop's documents.json, tagged with its course.

    The downloaded JSON is iterated as a list of course entries, each providing a
    'course' name and a 'documents' list. Every document dict is tagged in place
    with a 'course' key holding its parent course name and then yielded.

    Raises:
        requests.HTTPError: if the download returns a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    # Bound the wait and fail loudly on HTTP errors, so an error page is reported
    # as such instead of surfacing later as a confusing JSON decoding error.
    docs_response = requests.get(docs_url, timeout=30)
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    for course in documents_raw:
        course_name = course['course']

        for doc in course['documents']:
            doc['course'] = course_name
            yield doc

In [4]:
from dlt.destinations import qdrant

# Configure the Qdrant destination with a local path: the loaded data is stored
# in a folder named "db.qdrant".
qdrant_destination = qdrant(qd_path="db.qdrant")

In [7]:
# Build the pipeline that writes into the Qdrant destination under the
# "zoomcamp_tagged_data" dataset.
pipeline = dlt.pipeline(
    pipeline_name="zoomcamp_pipeline",
    dataset_name="zoomcamp_tagged_data",
    destination=qdrant_destination,
)

# Run the resource through the pipeline, then print the trace of the run
# (per-step timings and row counts, as shown in the output below).
load_info = pipeline.run(zoomcamp_data())
print(pipeline.last_trace)

Run started at 2025-07-03 15:38:58.413240+00:00 and COMPLETED in 4.67 seconds with 4 steps.
Step extract COMPLETED in 0.68 seconds.

Load package 1751557138.9573472 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.08 seconds.
Normalized data for the following tables:
- _dlt_pipeline_state: 1 row(s)
- zoomcamp_data: 948 row(s)

Load package 1751557138.9573472 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs

Step load COMPLETED in 3.37 seconds.
Pipeline zoomcamp_pipeline load step completed in 3.35 seconds
1 load package(s) were loaded to destination qdrant and into dataset zoomcamp_tagged_data
The qdrant destination used /Users/iuliia/projects/llm_zoomcamp/2.1_ws_agents/db.qdrant location to store data
Load package 1751557138.9573472 is LOADED and contains no failed jobs

Step run COMPLETED in 4.66 seconds.
Pipeline zoomcamp_pipeline load step completed in 3.35 seconds
1 load package(s) were loa

**How many rows were inserted into the zoomcamp_data collection?**

From the normalize step in the trace above:

Normalized data for the following tables:
- _dlt_pipeline_state: 1 row(s)
- zoomcamp_data: 948 row(s)

So 948 rows were inserted into `zoomcamp_data`.

**When inserting the data, an embedding model was used. Which one?**

You can find this out by inspecting the `meta.json` file created in the destination folder (`db.qdrant`).

In [8]:
import json

In [10]:
# Load the metadata file from the destination folder.
# The encoding is given explicitly so decoding does not depend on the platform's locale default.
with open('db.qdrant/meta.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [13]:
# Show the stored configuration of the 'zoomcamp_tagged_data' collection; the key
# under 'vectors' in the output is the named vector, which identifies the embedding model.
data['collections']['zoomcamp_tagged_data']

{'vectors': {'fast-bge-small-en': {'size': 384,
   'distance': 'Cosine',
   'hnsw_config': None,
   'quantization_config': None,
   'on_disk': None,
   'datatype': None,
   'multivector_config': None}},
 'shard_number': None,
 'sharding_method': None,
 'replication_factor': None,
 'write_consistency_factor': None,
 'on_disk_payload': None,
 'hnsw_config': None,
 'wal_config': None,
 'optimizers_config': None,
 'init_from': None,
 'quantization_config': None,
 'sparse_vectors': None,
 'strict_mode_config': None}