**Q1**

In [1]:
from sentence_transformers import SentenceTransformer

In [2]:
# Load the sentence-transformers model by name; `embedding_model` is reused by
# every later cell that needs to turn text into a vector.
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

# Encode the Q1 question once; the resulting vector is reused as the query in Q3.
user_question = "I just discovered the course. Can I still join it?"
user_question_embedding = embedding_model.encode(user_question)

# Q1 answer: the first component of the embedding vector.
print(user_question_embedding[0])

0.078222655


In [3]:
# Number of dimensions in the embedding (re-encodes the same question as the cell above).
len(embedding_model.encode("I just discovered the course. Can I still join it?"))

768

**Q2**

Create embeddings for documents

In [4]:
import requests
import numpy as np

# Download the FAQ documents from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
# Fail loudly on an HTTP error instead of hitting a confusing JSON decode error below.
docs_response.raise_for_status()
documents = docs_response.json()

# Filter documents for "machine-learning-zoomcamp"
filtered_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']

# Create one embedding per document from its question followed by its answer text.
# The list order matches `filtered_documents`, so row i of X belongs to document i.
embeddings = [
    embedding_model.encode(f"{doc['question']} {doc['text']}")
    for doc in filtered_documents
]

# Q2 answer: shape of the (n_documents, embedding_dim) matrix.
X = np.array(embeddings)
print(X.shape)

(375, 768)


**Q3**

Compute the cosine similarity between the query vector and the document embeddings. 
The vectors returned from the embedding model are already normalized, so we can simply multiply the matrix `X` by the query vector `v` to get the similarity scores.

The highest score in the results indicates the most similar document to the query.

In [5]:
# Query vector: the embedding of the Q1 question.
v = user_question_embedding
# One dot product per document row of X; per the note above, the vectors are
# normalized, so these values are cosine similarities.
scores = X.dot(v)
# Q3 answer: the highest similarity score across all documents.
print(np.max(scores))

0.6506573


**Q4**

Compute the similarity between a query vector and all the embeddings.

We load the ground truth dataset, which contains the correct document IDs for a set of queries.

In [6]:
import pandas as pd

# Location of the ground-truth CSV in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Keep only the rows for the course whose documents were embedded above, then
# convert to a list of dicts (one per query). The evaluation code reads the
# 'question' and 'document' keys of each record.
df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')


In [7]:
class VectorSearchEngine():
    """Exhaustive dot-product search over a matrix of document embeddings.

    Args:
        documents: Sequence of documents; ``documents[i]`` corresponds to
            row ``i`` of ``embeddings``.
        embeddings: 2-D NumPy array with one embedding per row.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the documents with the highest dot-product score, best first.

        Args:
            v_query: Query vector with the same dimensionality as the rows of
                ``embeddings``.
            num_results: Maximum number of documents to return. If it exceeds
                the number of indexed documents, all documents are returned.

        Returns:
            A list of at most ``num_results`` documents ordered by descending score.
        """
        scores = self.embeddings.dot(v_query)
        n_docs = len(scores)

        # np.argpartition raises ValueError when kth >= len(scores), so clamp
        # the request to the number of documents actually available.
        num_results = min(num_results, n_docs)
        if num_results <= 0:
            return []

        if num_results < n_docs:
            # Partial selection: the first `num_results` indices hold the
            # highest scores, in no particular order.
            idx = np.argpartition(-scores, num_results)[:num_results]
        else:
            # Every document was requested; nothing to partition.
            idx = np.arange(n_docs)

        # Fully sort only the selected candidates so the ranking is exact
        # (rank-sensitive metrics such as MRR depend on it).
        top_idx = idx[np.argsort(-scores[idx])]
        return [self.documents[i] for i in top_idx]

# Create the search engine over the machine-learning-zoomcamp documents and their
# embedding matrix X (row i of X was computed from filtered_documents[i]).
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)


np.argpartition: Partially sorts the array to get the top num_results elements. These elements are not guaranteed to be in order.

np.argsort on the top results: Fully sorts these top elements to ensure they are in the correct order. This sorting is crucial for MRR, which depends on the exact ranking of the results.

To achieve correct MRR, we should sort the top results to ensure they are in the correct order.

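To evaluate the performance of our search engine, we use two metrics: hit rate (the fraction of queries for which the correct document appears in the results) and MRR (mean reciprocal rank of the correct document).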



In [8]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the relevant document.

    Args:
        relevance_total: One list of booleans per query; each flag tells
            whether the result at that position is the relevant document.

    Returns:
        A float in [0, 1]; 0.0 when there are no queries at all.
    """
    # Guard against division by zero when nothing was evaluated.
    if len(relevance_total) == 0:
        return 0.0
    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list of booleans per query, ordered by rank
            (index 0 is the top result).

    Returns:
        The average of 1 / rank of the first relevant result over all queries,
        where a query with no relevant result contributes 0. Returns 0.0 when
        there are no queries at all.
    """
    # Guard against division by zero when nothing was evaluated.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # query with several matches would contribute more than 1.
                break
    return total_score / len(relevance_total)


In [9]:
from tqdm import tqdm
import time

def evaluate_custom_search_engine(ground_truth, search_function):
    """Run every ground-truth query through `search_function` and score the results.

    Args:
        ground_truth: Iterable of records; each record's 'document' key holds
            the id of the document that should be retrieved.
        search_function: Callable that takes one record and returns a ranked
            list of documents, each with an 'id' key.

    Returns:
        A dict with the 'hit_rate' and 'mrr' of the search function. The
        elapsed wall-clock time is printed as a side effect.
    """
    relevance_total = []
    started_at = time.time()

    for record in tqdm(ground_truth, desc="Evaluating search engine"):
        expected_id = record['document']
        retrieved = search_function(record)
        # One boolean per returned document, in rank order.
        relevance_total.append([hit['id'] == expected_id for hit in retrieved])

    metrics = {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
    print(f"Evaluation completed in {time.time() - started_at:.2f} seconds")
    return metrics


In [10]:
def search_query(q):
    """Embed the 'question' of a ground-truth record and return the top 5 matches.

    Uses the notebook-level `embedding_model` and `search_engine`.
    """
    return search_engine.search(
        embedding_model.encode(q['question']),
        num_results=5,
    )

In [11]:
# `search_query` relies on the `search_engine` instance created right after the
# VectorSearchEngine class definition, so it is not re-created here.

# Evaluate the search engine
evaluation_results_custom  = evaluate_custom_search_engine(ground_truth, search_query)
print(f"Hit rate: {evaluation_results_custom['hit_rate']}")
print(f"MRR: {evaluation_results_custom['mrr']}")

Evaluating search engine: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:29<00:00, 20.36it/s]

Evaluation completed in 89.89 seconds
Hit rate: 0.9398907103825137
MRR: 0.8516484517304189





**Q5**

Run Elasticsearch from Docker

Index the documents with Elasticsearch with specific settings and mappings, including dense vector fields for the embeddings.

After indexing, perform a search for the same query from Q1 using Elasticsearch. The ID of the document with the highest score is noted.

In [12]:
from elasticsearch import Elasticsearch
from tqdm import tqdm

# Client for the Elasticsearch instance expected to be running locally (see the
# "Run Elasticsearch from Docker" step above).
es_client = Elasticsearch('http://localhost:9200')

index_name = "course-questions"

# Index definition: the regular document fields plus one dense-vector field that
# stores the embedding of "question + text" for kNN search.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                # Must match the embedding length reported in Q1 (768).
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

# Drop any existing index first so the cell can be re-run from a clean state;
# ignore_unavailable=True is intended to make the delete a no-op when the index
# does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [13]:
# Index every document together with its embedding.
#
# The vectors are taken from `X` rather than re-encoding each document: row i of
# `X` was computed from the same "question text" string of filtered_documents[i].
assert len(filtered_documents) == len(X), "documents and embeddings are out of sync"

for doc, vector in tqdm(
    zip(filtered_documents, X),
    total=len(filtered_documents),
    desc="Indexing documents",
):
    # Build a separate payload instead of adding the vector to `doc` itself:
    # the same dict objects are held by `search_engine`, and mutating them would
    # silently attach a 768-float list to every document used elsewhere.
    es_doc = {**doc, 'question_text_vector': vector.tolist()}

    es_client.index(index=index_name, id=doc['id'], body=es_doc)


Indexing documents: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 375/375 [01:16<00:00,  4.93it/s]


In [14]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a filtered kNN search against the notebook's Elasticsearch index.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query vector as a plain list of floats.
        course: Only documents whose 'course' equals this value are considered.
        k: Number of nearest neighbours to return (default 5).
        num_candidates: Value passed as the kNN 'num_candidates' option
            (default 10000).

    Returns:
        The list found under ['hits']['hits'] of the search response.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    # Named `request_body` rather than `search_query` to avoid confusion with
    # the notebook-level function of that name.
    request_body = {
        "knn": knn,
        # Leave the stored embedding out of the returned documents.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(index=index_name, body=request_body)
    return es_results['hits']['hits']

# Perform the search with the same question as in Q1; the embedding is converted
# to a plain list before being passed into the Elasticsearch request.
query = 'I just discovered the course. Can I still join it?'
v_q = embedding_model.encode(query).tolist()
response = elastic_search_knn('question_text_vector', v_q, 'machine-learning-zoomcamp')

# Print the results in the order returned; the first line is the Q5 answer
# (ID of the highest-scoring document).
for hit in response:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}, Question: {hit['_source']['question']}")


ID: ee58a693, Score: 0.82532895, Question: The course has already started. Can I still join it?
ID: 0a278fb2, Score: 0.73585373, Question: I just joined. What should I do next? How can I access course materials?
ID: 6ba259b1, Score: 0.7295, Question: I filled the form, but haven't received a confirmation email. Is it normal?
ID: 9f261648, Score: 0.72849524, Question: Can I do the course in other languages, like R or Scala?
ID: e7ba6b8a, Score: 0.7252791, Question: The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 2021?


**Q6**

Evaluate the hit-rate for Elasticsearch. 

Load the ground truth dataset again and use a function to perform the search with Elasticsearch. 

The hit rate (and MRR) are calculated in the same way as before.

In [15]:
import pandas as pd

# Reload the ground-truth data (same URL and course filter as in the Q4 section).
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')
# Show one record to confirm its structure.
print(ground_truth[0])

{'question': 'Where can I sign up for the course?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}


In [16]:
def calculate_hit_rate(relevance_total):
    """Fraction of queries whose result list contains the relevant document.

    Args:
        relevance_total: One list of booleans per query; each flag tells
            whether the result at that position is the relevant document.

    Returns:
        A float in [0, 1]; 0.0 when there are no queries at all.
    """
    # Guard against division by zero when nothing was evaluated.
    if len(relevance_total) == 0:
        return 0.0
    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def calculate_mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list of booleans per query, ordered by rank
            (index 0 is the top result).

    Returns:
        The average of 1 / rank of the first relevant result over all queries,
        where a query with no relevant result contributes 0. Returns 0.0 when
        there are no queries at all.
    """
    # Guard against division by zero when nothing was evaluated.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts towards the reciprocal rank.
                break
    return total_score / len(relevance_total)

In [17]:
def evaluate_elasticsearch(ground_truth):
    """Score Elasticsearch kNN retrieval against the ground-truth queries.

    For each record the question is embedded, searched via `elastic_search_knn`
    restricted to the record's course, and every hit is compared with the
    expected document id.

    Args:
        ground_truth: Iterable of records with 'question', 'document' and
            'course' keys.

    Returns:
        A dict with the 'hit_rate' and 'mrr' of the retrieval. The elapsed
        wall-clock time is printed as a side effect.
    """
    relevance_total = []
    started_at = time.time()

    for record in tqdm(ground_truth, desc="Evaluating Elasticsearch"):
        question_text = record['question']
        expected_id = record['document']
        query_vector = embedding_model.encode(question_text).tolist()
        hits = elastic_search_knn('question_text_vector', query_vector, record['course'])

        # A hit lacking '_source' or '_source.id' counts as not relevant.
        relevance_total.append([
            '_source' in hit
            and 'id' in hit['_source']
            and hit['_source']['id'] == expected_id
            for hit in hits
        ])

    metrics = {
        'hit_rate': calculate_hit_rate(relevance_total),
        'mrr': calculate_mrr(relevance_total),
    }
    print(f"Evaluation completed in {time.time() - started_at:.2f} seconds")
    return metrics

# Evaluate Elasticsearch on the same ground-truth queries and report both metrics
# (the hit rate is the Q6 answer).
evaluation_results_elastic = evaluate_elasticsearch(ground_truth)
print(f"Elasticsearch - Hit rate: {evaluation_results_elastic['hit_rate']}")
print(f"Elasticsearch - MRR: {evaluation_results_elastic['mrr']}")

Evaluating Elasticsearch: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:27<00:00, 20.95it/s]

Evaluation completed in 87.35 seconds
Elasticsearch - Hit rate: 0.9398907103825137
Elasticsearch - MRR: 0.8504462659380693



