### Question 1. Getting the embeddings model: First value (1 point)

In [210]:
from sentence_transformers import SentenceTransformer
# Identifier of the sentence-transformers model handed to SentenceTransformer below.
model_name = "multi-qa-distilbert-cos-v1"
# Encoder object reused by the later cells that call `embedding_model.encode(...)`.
embedding_model = SentenceTransformer(model_name)

In [211]:
# Sample query; the same string is encoded again in the Question 3 search cell.
user_question = "I just discovered the course. Can I still join it?"

# Encode the query with the model loaded in the previous cell.
embedding_vector = embedding_model.encode(user_question)

# Question 1 asks for the first component of the resulting vector.
first_value = embedding_vector[0]
print(first_value)

0.07822261


In [212]:
import requests

# Download documents-with-ids.json from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# requests has no default timeout; without one a stalled connection blocks the cell forever.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [213]:
# Keep only the entries whose 'course' field is 'machine-learning-zoomcamp';
# entries without a 'course' key are dropped because .get() returns None for them.
filtered_documents = list(
    filter(lambda doc: doc.get('course') == 'machine-learning-zoomcamp', documents)
)

num_filtered_documents = len(filtered_documents)
print(f"Number of documents for 'machine-learning-zoomcamp': {num_filtered_documents}")

Number of documents for 'machine-learning-zoomcamp': 375


### Question 2. Creating the embeddings: Shape (1 point)

In [214]:
import numpy as np

# Embed each document as "<question> <text>", one encode call per document.
embeddings = [
    embedding_model.encode(f'{doc["question"]} {doc["text"]}')
    for doc in filtered_documents
]

# Stack the per-document vectors into a single matrix.
X = np.array(embeddings)

# Show the dimensions of the embedding matrix.
print(X.shape)

(375, 768)


### Question 3. Search: Highest score (1 point)

In [215]:
# Embed the query with the same model that produced the document matrix X.
v = embedding_model.encode(user_question)

# Dot product of every row of X with the query vector.
# NOTE(review): this equals cosine similarity only if the embeddings are
# unit-length — confirm that this model normalizes its output.
scores = X.dot(v)

# Largest score over all documents.
highest_score = scores.max()
print(highest_score)

0.6506574


### Question 4. Hit-rate for our search engine (1 point)

In [230]:
class VectorSearchEngine:
    """Exhaustive dot-product search over a matrix of document embeddings.

    `documents` and `embeddings` are stored as given; row i of `embeddings`
    is treated as the vector for `documents[i]`.
    """

    def __init__(self, documents, embeddings):
        self.embeddings = embeddings
        self.documents = documents

    def search(self, v_query, num_results=10):
        """Return up to `num_results` documents ranked by descending dot product with `v_query`."""
        similarity = self.embeddings.dot(v_query)
        # argsort is ascending, so sorting the negated scores yields a descending ranking.
        ranking = np.argsort(-similarity)
        top_hits = []
        for position in ranking[:num_results]:
            top_hits.append(self.documents[position])
        return top_hits

In [234]:
# Build the engine over the filtered documents and their embedding matrix X.
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
# Five best-scoring documents for the query vector `v`.
result = search_engine.search(v, num_results=5)

In [105]:
import pandas as pd

# Location of ground-truth-data.csv in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Read the CSV and keep only the rows for the ML Zoomcamp course in one expression.
df_ground_truth = pd.read_csv(ground_truth_url).loc[
    lambda frame: frame.course == 'machine-learning-zoomcamp'
]
# One dict per remaining row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [173]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains a relevant document.

    Args:
        relevance_total: sequence of per-query relevance lists; each inner
            list holds one flag per returned result, truthy where that result
            is the expected document.

    Returns:
        The share of inner lists that contain at least one truthy flag, as a
        float. Returns 0.0 for an empty `relevance_total` instead of raising
        ZeroDivisionError.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / total_queries

In [174]:
def mrr(relevance_total):
    """Mean reciprocal rank over a set of queries.

    For each query only the first relevant result counts: it contributes
    1 / rank (rank is 1-based), and a query with no relevant result
    contributes 0.

    Args:
        relevance_total: sequence of per-query relevance lists; each inner
            list holds one flag per returned result in ranked order, truthy
            where that result is the expected document.

    Returns:
        The average reciprocal rank as a float. Returns 0.0 for an empty
        `relevance_total` instead of raising ZeroDivisionError.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit; counting later hits too would let the
                # per-query score exceed 1, which is not a reciprocal rank.
                break

    return total_score / total_queries

In [152]:
def evaluate(ground_truth, search_function):
    """Run `search_function` on every ground-truth record and score the results.

    Each record `q` is passed to `search_function(q)`; every returned result's
    'id' is compared with `q['document']` to build one relevance list per
    query. Those lists are then scored with `hit_rate` and `mrr`.

    NOTE(review): `tqdm` is not imported in any visible cell — confirm it is
    imported elsewhere in the notebook before this runs on a fresh kernel.

    Returns:
        dict with the keys 'hit_rate' and 'mrr'.
    """
    per_query_relevance = []

    for query in tqdm(ground_truth):
        expected_id = query['document']
        hits = search_function(query)
        per_query_relevance.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_relevance)
    metrics['mrr'] = mrr(per_query_relevance)
    return metrics

### Question 5. Indexing with Elasticsearch: ID with the highest score (1 point)

### Question 6. Hit-rate for Elasticsearch (1 point)