In [56]:
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding
import json


In [3]:
client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [8]:
model_handle = "jinaai/jina-embeddings-v2-small-en"


EMBEDDING_DIMENSIONALITY = 512

# Define the collection name
collection_name = "zoomcamp-rag"

# Create the collection with specified vector parameters
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [14]:
def search(query, limit=1):

    results = client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query,
            model=model_handle 
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return results

In [None]:
from fastembed.embedding import TextEmbedding
import numpy as np

# Initialize embedding model
model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

# Your query
query = ["I just discovered the course. Can I join now?"]

# Generate embeddings
embedding_generator = model.embed(query)

# Convert generator to list (only one item in this case)
embedding = list(embedding_generator)[0]

# Optional: convert to NumPy array
embedding_np = np.array(embedding)

# Confirm shape
print(f"Shape: {embedding_np.shape}")




Shape: (512,)


In [19]:
embedding_np.min()

np.float64(-0.11726373551188797)

In [21]:
from qdrant_client.http import models
from uuid import uuid4

# Define the document text
doc_text = "Can I still join the course after the start date?"

# Create a new point with a UUID
point = models.PointStruct(
    id=str(uuid4()),  # use UUID for safety
    vector=models.Document(
        text=doc_text,
        model="jinaai/jina-embeddings-v2-small-en"
    ),
    payload={
        "text": doc_text,
        "section": "General Enrollment Questions",
        "course": "mlops-zoomcamp"
    }
)

# Insert the point into the collection
client.upsert(
    collection_name=collection_name,
    points=[point]
)


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
def search(query, limit=1):
    return client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model="jinaai/jina-embeddings-v2-small-en"
        ),
        limit=limit,
        with_payload=True
    )

# Run search with your query
query_text = "I just discovered the course. Can I join now?"
result = search(query_text)


In [24]:
# The top match
top_match = result.points[0]

# Print similarity and content
print(f"Cosine similarity: {top_match.score:.4f}")


Cosine similarity: 0.9009


In [52]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from fastembed.embedding import TextEmbedding
import numpy as np
from uuid import uuid4

# Step 1: Set up Qdrant client and model
client = QdrantClient(host="localhost", port=6333)
collection_name = "zoomcamp-rag"
model_handle = "jinaai/jina-embeddings-v2-small-en"

# Recreate the collection (optional safety step)
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=512,
        distance=models.Distance.COSINE
    )
)

# Step 2: Input documents
documents = [{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'}]

# Step 3: Convert to Qdrant Points
points = []
for i, doc in enumerate(documents):
    points.append(models.PointStruct(
        id=i,
        vector=models.Document(
            text=doc['text'],
            model=model_handle
        ),
        payload={
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course']
        }
    ))

# Step 4: Insert documents into Qdrant
client.upsert(
    collection_name=collection_name,
    points=points
)


  client.recreate_collection(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [53]:
def search(query_text, limit=5):
    return client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query_text,
            model=model_handle
        ),
        limit=limit,
        with_payload=True
    )

# Query
query = "I just discovered the course. Can I join now?"
results = search(query)


In [55]:
for i, point in enumerate(results.points):
    print(f"\nRank {i+1}")
    print(f"Cosine similarity: {point.score:.4f}")
    print(f"Retrieved text:\n{point.payload['text']}")



Rank 1
Cosine similarity: 0.8182
Retrieved text:
Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Rank 2
Cosine similarity: 0.8085
Retrieved text:
The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.

Rank 3
Cosine similarity: 0.7630
Retrieved text:
Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that 

In [57]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from fastembed.embedding import TextEmbedding
import numpy as np
from uuid import uuid4

# Step 1: Set up Qdrant client and model
client = QdrantClient(host="localhost", port=6333)
collection_name = "zoomcamp-rag"
model_handle = "jinaai/jina-embeddings-v2-small-en"
model = TextEmbedding(model_name=model_handle)

# Step 2: Define documents with full_text
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
     'section': 'General course-related questions',
     'question': 'Course - Can I still join the course after the start date?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
     'section': 'General course-related questions',
     'question': 'Course - Can I follow the course after it finishes?',
     'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first “Office Hours'' live.\nSubscribe to course public Google Calendar...",
     'section': 'General course-related questions',
     'question': 'Course - When will the course start?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
     'section': 'General course-related questions',
     'question': 'Course - What can I do before the course starts?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
     'section': 'General course-related questions',
     'question': 'How can we contribute to the course?',
     'course': 'data-engineering-zoomcamp'}
]

# Step 3: Concatenate full_text and store it
for doc in documents:
    doc['full_text'] = doc['question'] + ' ' + doc['text']

# Step 4: Recreate the collection
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=512,
        distance=models.Distance.COSINE
    )
)

# Step 5: Embed and upload full_text documents
points = []
for i, doc in enumerate(documents):
    points.append(models.PointStruct(
        id=i,
        vector=models.Document(
            text=doc['full_text'],
            model=model_handle
        ),
        payload={
            "text": doc['text'],
            "question": doc['question'],
            "full_text": doc['full_text'],
            "section": doc['section'],
            "course": doc['course']
        }
    ))

client.upsert(
    collection_name=collection_name,
    points=points
)

# Step 6: Define search function
def search(query_text, limit=5):
    return client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query_text,
            model=model_handle
        ),
        limit=limit,
        with_payload=True
    )

# Step 7: Run search and print results
query = "I just discovered the course. Can I join now?"
results = search(query)

for i, point in enumerate(results.points):
    print(f"\nRank {i+1}")
    print(f"Cosine similarity: {point.score:.4f}")
    print(f"Matched Question:\n{point.payload['question']}")
    print(f"Retrieved Text:\n{point.payload['text']}")


  client.recreate_collection(



Rank 1
Cosine similarity: 0.8515
Matched Question:
Course - Can I still join the course after the start date?
Retrieved Text:
Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Rank 2
Cosine similarity: 0.8437
Matched Question:
Course - Can I follow the course after it finishes?
Retrieved Text:
Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Rank 3
Cosine similarity: 0.8200
Matched Question:
Course - When will the course start?
Retrieved Text:
The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The

In [61]:
EMBEDDING_DIMENSIONALITY = 384

for model in TextEmbedding.list_supported_models():
    if model["dim"] == EMBEDDING_DIMENSIONALITY:
        print(json.dumps(model, indent=2))

{
  "model": "BAAI/bge-small-en",
  "sources": {
    "hf": "Qdrant/bge-small-en",
    "url": "https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz",
    "_deprecated_tar_struct": true
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.13,
  "additional_files": [],
  "dim": 384,
  "tasks": {}
}
{
  "model": "BAAI/bge-small-en-v1.5",
  "sources": {
    "hf": "qdrant/bge-small-en-v1.5-onnx-q",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.067,
  "additional_files": [],
  "dim": 384,
  "tasks": {}
}
{
  "model": "snowflake/snowflake-arctic-embed-xs",
  "sou

In [64]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()


documents = []

for course in documents_raw:
    course_name = course['course']
    if course_name != 'machine-learning-zoomcamp':
        continue

    for doc in course['documents']:
        doc['course'] = course_name
        doc['full_text'] = doc['question'] + ' ' + doc['text']
        documents.append(doc)
        
		

In [65]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from fastembed.embedding import TextEmbedding
from uuid import uuid4

# Initialize Qdrant and FastEmbed
client = QdrantClient(host="localhost", port=6333)
model_handle = "jinaai/jina-embeddings-v2-small-en"
collection_name = "ml-zoomcamp-rag"
model = TextEmbedding(model_name=model_handle)

# Step 2: Recreate Qdrant collection
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=512,
        distance=models.Distance.COSINE
    )
)

# Step 3: Convert documents to Qdrant points
points = []
for i, doc in enumerate(documents):
    points.append(models.PointStruct(
        id=i,
        vector=models.Document(
            text=doc['full_text'],
            model=model_handle
        ),
        payload={
            "text": doc['text'],
            "question": doc['question'],
            "full_text": doc['full_text'],
            "section": doc['section'],
            "course": doc['course']
        }
    ))

# Step 4: Insert into Qdrant
client.upsert(
    collection_name=collection_name,
    points=points
)

# Step 5: Search function
def search(query_text, limit=5):
    return client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query_text,
            model=model_handle
        ),
        limit=limit,
        with_payload=True
    )

# Step 6: Run query and show results
query = "I just discovered the course. Can I join now?"
results = search(query)

for i, point in enumerate(results.points):
    print(f"\nRank {i+1}")
    print(f"Cosine similarity: {point.score:.4f}")
    print(f"Matched Question:\n{point.payload['question']}")
    print(f"Retrieved Text:\n{point.payload['text']}")


  client.recreate_collection(



Rank 1
Cosine similarity: 0.8621
Matched Question:
The course has already started. Can I still join it?
Retrieved Text:
Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.
In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.

Rank 2
Cosine similarity: 0.8389
Matched Question:
I just joined. What should I do next? How can I access course materials?
Retrieved Text:
Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.
Click on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course y