In [1]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
from qdrant_client import QdrantClient, models
qd_client = QdrantClient("http://localhost:6333")
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"
collection_name = "zoomcamp-faq"

qd_client.delete_collection(collection_name=collection_name)
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)


qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [4]:
# Question 1
from fastembed import TextEmbedding
import numpy as np

# Load the model
model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en", cache_dir=".cache")

# Embed the query
query = "I just discovered the course. Can I join now?"
query_embedding = list(model.embed([query]))[0]

print("Min value:", np.min(query_embedding))

Min value: -0.11726373551188797


In [5]:
# Question 2
# Embed the document
doc = "Can I still join the course after the start date?"
doc_vector = list(model.embed([doc]))[0]

# Compute cosine similarity
similarity = np.dot(query_embedding, doc_vector)
print("Cosine similarity:", similarity)

Cosine similarity: 0.9008528856818037


In [6]:
# Question 3
# Embed document texts
documents = [{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'}]

doc_texts = [doc["text"] for doc in documents]
doc_embeddings = list(model.embed(doc_texts))

# Cosine similarity against the query
similarities = [np.dot(query_embedding, doc_vec) for doc_vec in doc_embeddings]

# Find the index of the most similar document
most_similar_document_index = np.argmax(similarities)

print("Most similar document index:", most_similar_document_index)

Most similar document index: 1


In [7]:
# Question 4
# Concatenate question + text
full_texts = [doc["question"] + " " + doc["text"] for doc in documents]

# Embed the full texts
full_embeddings = list(model.embed(full_texts))

# Compute cosine similarity using dot product
V_full = np.array(full_embeddings)
q = np.array(query_embedding)

similarities_full = V_full.dot(q)

# Find most similar
best_index_full = np.argmax(similarities_full)

print("Most similar document index:", best_index_full)

## Answer to reasoning question: Yes it is different than question 3. When we use both question and answer(i.e. text) in the embedding, it gave model more context. The question or the answer likely had wording similar to the query, increasing semantic similarity.

Most similar document index: 0


In [8]:
# Question 5
from fastembed import TextEmbedding

# Get the list of supported models
models = TextEmbedding.list_supported_models()

# Extract just the dimensionalities
dims = [m["dim"] for m in models]

# Get the smallest one
print("Smallest dimension available:", min(dims))

Smallest dimension available: 384


In [10]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models
from qdrant_client.models import PointStruct
import requests

# Load embedding model
embedding_model = TextEmbedding(model_name="BAAI/bge-small-en", cache_dir=".cache")
EMBEDDING_DIMENSIONALITY = 384  # bge-small-en output size

# Load ML Zoomcamp FAQ documents
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
documents_raw = requests.get(docs_url).json()

documents = []
for course in documents_raw:
    if course["course"] != "machine-learning-zoomcamp":
        continue
    for doc in course["documents"]:
        doc["course"] = course["course"]
        documents.append(doc)

# Create in-memory Qdrant client
qd_client = QdrantClient(":memory:")

# Create Qdrant collection
collection_name = "mlzoomcamp_faq"
qd_client.delete_collection(collection_name=collection_name)
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

# Insert document vectors into Qdrant
points = []
for idx, doc in enumerate(documents):
    full_text = doc["question"] + " " + doc["text"]
    vector = list(embedding_model.embed([full_text]))[0]
    points.append(PointStruct(id=idx, vector=vector, payload=doc))

qd_client.upsert(collection_name=collection_name, points=points)

# Embed the query
query = "I just discovered the course. Can I join now?"
query_vector = list(embedding_model.embed([query]))[0]

# Search the Qdrant collection
search_result = qd_client.query_points(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=1,
    with_payload=True  # so you get back the question/text info
)

# Show top result
top_result = search_result[0]
print("Top document score:", top_result.score)
print("Top document:", top_result.payload["question"])


AssertionError: Unknown arguments: ['query_vector']