# llm-zoomcamp / Homework 2 / Vector Search

### Q1. Embedding the query

In [1]:
import numpy as np
from fastembed import TextEmbedding

In [2]:
# Question text that is embedded in Q1 and reused as the search query in later sections
query = "I just discovered the course. Can I join now?"

In [3]:
# Instantiate the fastembed model used for Q1: "jinaai/jina-embeddings-v2-small-en"
model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

In [4]:
# Embed the query; embed() is given a one-element list, so the first
# (and only) result is the query's vector
embeddings = list(model.embed([query]))
query_vector = embeddings[0]

# Statistics asked for in Q1: dimensionality and smallest component
vector_size, min_value = len(query_vector), np.min(query_vector)

print(
    f"Vector size: {vector_size}",
    f"Minimum value in vector: {min_value:.6f}",
    sep="\n",
)

Vector size: 512
Minimum value in vector: -0.117264


#### Cosine similarity

In [5]:
import numpy as np
# L2 norm of the query embedding; the recorded output of 1.0 shows the vector is unit-length
np.linalg.norm(query_vector)

np.float64(1.0)

In [6]:
# Dot product of the vector with itself (its squared norm); for unit-length
# vectors the dot product of two embeddings is their cosine similarity
query_vector.dot(query_vector)

np.float64(1.0000000000000002)

### Q2. Cosine similarity with another vector

In [7]:
import numpy as np
from fastembed import TextEmbedding
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def find_cosine_similarity(model_name, querry, doc):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        model_name: fastembed model identifier passed to ``TextEmbedding``.
        querry: Query text to embed. The misspelled parameter name is kept
            so that existing keyword callers keep working.
        doc: Document text to compare against the query.

    Returns:
        The single similarity value taken from the 1x1 matrix returned by
        ``sklearn.metrics.pairwise.cosine_similarity``.
    """
    # Load embedding model
    model = TextEmbedding(model_name=model_name)

    # Embed the *argument*. The previous body read the notebook-level
    # ``query`` global here, which silently ignored whatever the caller
    # passed and raised NameError when no such global existed.
    query_vector = list(model.embed([querry]))[0]
    doc_vector = list(model.embed([doc]))[0]

    # sklearn works on 2D inputs, so turn each vector into a single row
    query_vector_2d = np.array(query_vector).reshape(1, -1)
    doc_vector_2d = np.array(doc_vector).reshape(1, -1)

    # The result is a 1x1 matrix; unwrap its only entry
    similarity = cosine_similarity(query_vector_2d, doc_vector_2d)[0][0]

    return similarity

In [9]:
# Embedding model passed to the helper calls in Q2, Q3 and Q4
model_name = "jinaai/jina-embeddings-v2-small-en"

# Sentence pair compared in Q2
query = "I just discovered the course. Can I join now?"
doc = "Can I still join the course after the start date?"

In [10]:
# Score the Q2 sentence pair with the model configured in the previous cell
similarity = find_cosine_similarity(model_name, query, doc)
print(f"Cosine similarity: {similarity:.4f}")

Cosine similarity: 0.9009


### Q3. Ranking by cosine


In [11]:
import numpy as np
from fastembed import TextEmbedding

In [12]:
def find_ranking_by_cosine(model_name, query, documents):
    """Return the position of the document whose text is closest to the query.

    The query and every ``doc["text"]`` are embedded with the fastembed model
    ``model_name``. All vectors are scaled to unit length and each document is
    scored by its dot product with the query, which is the cosine similarity.

    Args:
        model_name: Model identifier forwarded to ``TextEmbedding``.
        query: Query text.
        documents: Sequence of dicts, each with a ``"text"`` entry.

    Returns:
        The index, as produced by numpy's argmax, of the best-scoring document.
    """
    embedder = TextEmbedding(model_name=model_name)

    # Query first, documents second
    query_embeddings = list(embedder.embed([query]))
    q = np.array(query_embeddings[0])

    texts = [entry["text"] for entry in documents]
    doc_matrix = np.array(list(embedder.embed(texts)))  # one row per document

    # Rescale to unit length so the dot product below is the cosine similarity
    q_unit = q / np.linalg.norm(q)
    row_norms = np.linalg.norm(doc_matrix, axis=1, keepdims=True)
    doc_unit = doc_matrix / row_norms

    scores = doc_unit.dot(q_unit)
    return scores.argmax()
    

In [13]:
# Query text (the same sentence used in Q1 and Q2, re-declared in this cell)
query = "I just discovered the course. Can I join now?"

# Candidate documents for Q3: each dict has only a 'text' field, which is the
# field find_ranking_by_cosine embeds.
# NOTE: the name `documents` is rebound by later cells (Q4 and Q6).
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.'},
    {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."},
    {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.'}
]

In [14]:

# Rank the text-only documents against the query.
# `model_name` is the value assigned in the Q2 configuration cell.
best_index = find_ranking_by_cosine(model_name, query, documents)
print(f"Most similar document index: {best_index}")

Most similar document index: 1


### Q4. Ranking by cosine, version two

In [15]:
import numpy as np
from fastembed import TextEmbedding

In [16]:
def find_ranking_by_cosine_v2(model_name, query, documents):
    """Rank documents by cosine similarity using question + text.

    Each document is represented by its ``'question'`` and ``'text'`` fields
    joined with a single space. The query and these combined strings are
    embedded with the fastembed model ``model_name``, scaled to unit length,
    and compared by dot product.

    Args:
        model_name: Model identifier forwarded to ``TextEmbedding``.
        query: Query text.
        documents: Sequence of dicts with ``'question'`` and ``'text'`` entries.

    Returns:
        The index, as produced by ``np.argmax``, of the best-scoring document.
    """
    encoder = TextEmbedding(model_name=model_name)

    # Unit-length query vector
    query_vector = np.array(list(encoder.embed([query]))[0])
    query_vector = query_vector / np.linalg.norm(query_vector)

    # Build the "question text" string for every document, then embed them all
    combined = []
    for entry in documents:
        combined.append(entry['question'] + ' ' + entry['text'])
    doc_vectors = np.array(list(encoder.embed(combined)))

    # Unit-length rows, so the dot product below is the cosine similarity
    doc_vectors = doc_vectors / np.linalg.norm(doc_vectors, axis=1, keepdims=True)

    return np.argmax(np.dot(doc_vectors, query_vector))

In [17]:
# Query text (the same sentence as in the earlier sections, re-declared in this cell)
query = "I just discovered the course. Can I join now?"

# Candidate documents for Q4: each dict has a 'question' and a 'text' field;
# find_ranking_by_cosine_v2 joins the two before embedding.
# NOTE: this rebinds `documents` from the Q3 cell, and Q6 rebinds it again.
documents = [
    {
        'question': 'Course - Can I still join the course after the start date?',
        'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."
    },
    {
        'question': 'Course - Can I follow the course after it finishes?',
        'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.'
    },
    {
        'question': 'Course - When will the course start?',
        'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."
    },
    {
        'question': 'Course - What can I do before the course starts?',
        'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.'
    },
    {
        'question': 'How can we contribute to the course?',
        'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.'
    }
]


In [18]:

# Rank the question + text documents against the query.
# `model_name` is the value assigned in the Q2 configuration cell.
best_index = find_ranking_by_cosine_v2(model_name, query, documents)
print(f"Most similar document index: {best_index}")

Most similar document index: 0


**Q3 vs. Q4.** In Q3, the embeddings were based only on the document text, so the top match (index 1) reflected general content similarity.
In Q4, embedding question + text provided better context, making index 0 the best match because it aligned more directly with the query’s intent.


### Q5. Selecting the embedding model

In [21]:
from fastembed import TextEmbedding

In [22]:
def get_fastembed_model_dimentionality(model_name):
    """Return the length of the vector the named fastembed model produces.

    The model is instantiated and a fixed probe sentence is embedded; the
    length of that single embedding is the result.

    Args:
        model_name: Model identifier forwarded to ``TextEmbedding``.

    Returns:
        ``len()`` of the probe sentence's embedding.
    """
    embedder = TextEmbedding(model_name=model_name)
    probe_vectors = list(embedder.embed(["your text here"]))
    return len(probe_vectors[0])

In [23]:
# NOTE: this rebinds `model_name`; re-running the Q2-Q4 cells after this one
# would make them use this model instead of the one set in Q2.
model_name = "BAAI/bge-small-en"
dimenlen = get_fastembed_model_dimentionality(model_name)
print(f"dimensionality for model: {dimenlen}")

dimensionality for model: 384


### Q6. Indexing with Qdrant (2 points)


In [24]:
import requests
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import numpy as np
from fastembed import TextEmbedding

In [25]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Bound the request so a stalled connection cannot hang the notebook, and fail
# on an HTTP error status instead of trying to JSON-decode an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Keep only the machine-learning-zoomcamp entries and tag each with its course.
# NOTE: this rebinds `documents`, which the Q3/Q4 cells used for their own lists.
documents = []

for course in documents_raw:
    course_name = course['course']
    if course_name != 'machine-learning-zoomcamp':
        continue

    for doc in course['documents']:
        # Adds the course name to the dict in place (documents_raw sees it too)
        doc['course'] = course_name
        documents.append(doc)

In [26]:
# Embed question + text for every document with the BAAI/bge-small-en model.
# `model` and `embeddings` are reused by the indexing and query cells below.
model = TextEmbedding(model_name="BAAI/bge-small-en")
full_texts = [
    doc['question'] + ' ' + doc['text']
    for doc in documents
]
embeddings = list(model.embed(full_texts))

In [27]:
# Create the Qdrant collection (if it does not exist yet) and insert the points
client = QdrantClient(host="localhost", port=6333)

collection_name = "mlzc_bge_small"

# Take the vector size from the embeddings themselves rather than hard-coding
# 384, so this cell stays correct if the embedding model is swapped.
embedding_dim = len(embeddings[0])
if not client.collection_exists(collection_name=collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE)
    )

# One point per document: the id is the document's position in `documents`,
# the vector is its embedding and the payload is the document dict itself.
points = [
    PointStruct(id=i, vector=embeddings[i].tolist(), payload=documents[i])
    for i in range(len(documents))
]
client.upsert(collection_name=collection_name, points=points)


UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [28]:
# Query the collection with the embedded prompt
query = "I just discovered the course. Can I join now?"
q_vec = list(model.embed([query]))[0].tolist()

# `client.search` is deprecated in qdrant-client; `query_points` is its
# replacement. It returns a response object whose `.points` holds the scored
# hits, so `results` is still a list of hits as before.
results = client.query_points(
    collection_name=collection_name,
    query=q_vec,
    limit=1
).points

top_score = results[0].score
print(f"Top result score: {top_score:.4f}")


Top result score: 0.8703


  results = client.search(
