# Module 3: Vector-Search Homework

# |

# 

## Q1. Getting the embeddings model

In [7]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-transformers checkpoint to load.
model_name = "multi-qa-distilbert-cos-v1"
# Load the model once; later cells reuse `embedding_model` for every encode call.
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [10]:
# Example query; its embedding is reused by the search cells further down.
user_question = "I just discovered the course. Can I still join it?"

# Despite the plural name this is a single vector for the single question.
question_embeddings = embedding_model.encode(user_question)

# Cell output: the dimensionality of the embedding.
len(question_embeddings)

768

Question: What's the first value of the resulting vector?

In [17]:
# Q1 answer: first component of the query embedding, rounded to two decimals.
print(f"The first value of the resulting vector is {question_embeddings[0]:.2f}")

The first value of the resulting vector is 0.08


# 

# 

## Q2. Creating the embeddings

In [25]:
import requests 
import numpy as np

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
# `?raw=1` is appended to the blob URL; presumably this makes GitHub serve the
# raw file instead of the HTML page (not verifiable from this notebook).
docs_url = f'{base_url}/{relative_url}?raw=1'
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Fail with a clear HTTP error instead of a confusing JSON decode error when
# the server answers with an error page.
docs_response.raise_for_status()
# Parsed JSON payload; later cells iterate it as a list of dicts with
# 'question', 'text', 'course' and 'id' keys.
documents = docs_response.json()

In [27]:
# One embedding per document, computed from the question and the answer text
# joined by a single space. Order matches `documents`.
embeddings = [
    embedding_model.encode(f"{document['question']} {document['text']}")
    for document in documents
]

In [28]:
# Stack the per-document vectors into one 2-D matrix (one row per document).
X = np.array(embeddings)


Question: What's the shape of X?

In [30]:
# Q2 answer: (number of documents, embedding dimension).
print(f"The shape of X is {X.shape}")

The shape of X is (948, 768)


# 

# 

## Q3. Search

In [34]:
# Dot product of every document embedding (rows of X) with the query vector:
# one similarity score per document.
scores = X.dot(question_embeddings)

Question: What's the highest score in the results?

In [35]:
# Q3 answer: best similarity score across all documents, rounded to two decimals.
print(f"The highest score in the results is {np.max(scores):.2f}")

The highest score in the results is 0.65


## 

## 

### Vector search

In [36]:
class VectorSearchEngine():
    """Brute-force similarity search over a matrix of document embeddings.

    Row ``i`` of ``embeddings`` is scored against the query and mapped back to
    ``documents[i]``, so both must be in the same order.
    """

    def __init__(self, documents, embeddings):
        self.embeddings = embeddings
        self.documents = documents

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents, highest dot-product score first."""
        similarity = self.embeddings.dot(v_query)
        # Negating the scores turns argsort's ascending order into descending.
        ranked_rows = np.argsort(-similarity)
        top_documents = []
        for row in ranked_rows[:num_results]:
            top_documents.append(self.documents[row])
        return top_documents

# In-memory engine over the downloaded documents and their embedding matrix X.
search_engine = VectorSearchEngine(documents=documents, embeddings=X)
# Cell output: the five best-scoring documents for the example question.
search_engine.search(question_embeddings, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'Star the r

## 

## 

## Q4. Hit-rate for our search engine

In [54]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm


es_client = Elasticsearch('http://localhost:9200')

index_name = "course-questions"

# The three vector fields share one dense_vector mapping; each field gets its
# own copy of this template.
vector_field_mapping = {
    "type": "dense_vector",
    "dims": 768,
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": dict(vector_field_mapping),
            "text_vector": dict(vector_field_mapping),
            "question_text_vector": dict(vector_field_mapping),
        }
    },
}

# Drop any previous copy of the index so the cell can be re-run, then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [55]:
# Attach three embeddings to every document (mutates the dicts in `documents`):
# question only, answer text only, and question + text joined by a space.
for doc in tqdm(documents):
    question, text = doc['question'], doc['text']
    combined = question + ' ' + text

    doc.update(
        question_vector=embedding_model.encode(question),
        text_vector=embedding_model.encode(text),
        question_text_vector=embedding_model.encode(combined),
    )

100%|██████████| 948/948 [04:57<00:00,  3.19it/s]


In [56]:
# Index the documents one request at a time; each dict already carries the
# vector fields added by the encoding cell.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:19<00:00, 47.93it/s]


In [38]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list of booleans per query; an entry is True when
            the result at that position is the expected document.

    Returns:
        Hit rate in [0, 1]. Returns 0.0 when there are no queries, rather than
        raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)

    return hits / len(relevance_total)

In [39]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result for each query.

    Args:
        relevance_total: one list of booleans per query; an entry is True when
            the result at that position is the expected document.

    Returns:
        Average over queries of 1 / rank of the first True entry (rank starts
        at 1). A query with no True entry contributes 0. Returns 0.0 when
        there are no queries, rather than raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, a duplicate
                # match lower in the list would push the score above 1.
                break

    return total_score / len(relevance_total)

In [40]:
def evaluate(ground_truth, search_function):
    """Score a search function against labelled queries.

    Each ground-truth record must have a 'document' key holding the expected
    document id. ``search_function`` receives the whole record and returns a
    list of result dicts with an 'id' key.

    Returns a dict with the 'hit_rate' and 'mrr' of the collected results.
    """

    def relevance_for(query):
        # Per-position flags: does this result carry the expected id?
        expected_id = query['document']
        return [hit['id'] == expected_id for hit in search_function(query)]

    relevance_total = [relevance_for(query) for query in tqdm(ground_truth)]

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [44]:
def elastic_search_knn(field, vector, course):
    """Run a course-filtered kNN query and return the `_source` of each hit.

    Args:
        field: name of the dense_vector field to search.
        vector: query embedding.
        course: value the 'course' field must match exactly (term filter).

    Returns:
        List of hit `_source` dicts, limited to the text, section, question,
        course and id fields. The query asks for k=5 neighbours.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": {"term": {"course": course}},
        },
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [45]:
def question_vector_knn(q):
    """Embed the record's question and search the 'question_vector' field within its course."""
    question, course = q['question'], q['course']
    query_vector = embedding_model.encode(question)
    return elastic_search_knn('question_vector', query_vector, course)

In [49]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Load the labelled queries and keep only the machine-learning-zoomcamp rows.
df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame.course == 'machine-learning-zoomcamp']
)
# One plain dict per remaining row, as expected by `evaluate`.
ground_truth = df_ground_truth.to_dict(orient='records')

In [57]:
# Evaluate kNN search on the 'question_vector' field against the ground truth
# (one Elasticsearch query per ground-truth record).
evaluation_results = evaluate(ground_truth, question_vector_knn)
print(f"Hit rate: {evaluation_results['hit_rate']}")
print(f"MRR: {evaluation_results['mrr']}")

100%|██████████| 1830/1830 [03:33<00:00,  8.58it/s]

Hit rate: 0.8076502732240437
MRR: 0.6985519125683067





## 

## 

## Q5. Indexing with Elasticsearch

In [60]:
def elastic_search_knn(field, vector, course=None):
    """Run a kNN query, optionally restricted to one course.

    Args:
        field: name of the dense_vector field to search.
        vector: query embedding.
        course: when truthy, only documents whose 'course' field equals this
            value are considered; otherwise no filter is applied.

    Returns:
        List of hit `_source` dicts, limited to the text, section, question,
        course and id fields. The query asks for k=5 neighbours.
    """
    # The filter clause is added only when a course was actually given.
    course_filter = {"filter": {"term": {"course": course}}} if course else {}

    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            **course_filter,
        },
        "_source": ["text", "section", "question", "course", "id"],
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

Document ID with the highest score: ee58a693


In [66]:
# course=None means no course filter is added to the query.
results = elastic_search_knn('question_vector', question_embeddings, course=None)

# Takes the first hit as the best match — assumes Elasticsearch returns hits
# ordered by descending score (TODO confirm).
highest_score_doc = results[0]

# Cell output: the selected document.
highest_score_doc


{'question': 'The course has already started. Can I still join it?',
 'course': 'machine-learning-zoomcamp',
 'section': 'General course-related questions',
 'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'id': 'ee58a693'}

Question: What's the ID of the document with the highest score?


In [67]:
# Q5 answer: id of the first document returned by the unfiltered kNN search.
print(f"The ID of the document with the highest score is the ID {highest_score_doc['id']}")

The ID of the document with the highest score is the ID ee58a693


## 

## 

## Q6. Hit-rate for Elasticsearch

In [68]:
# Search adapter for `evaluate`: question-vector kNN in Elasticsearch.
def question_vector_search_elastic(q):
    """Encode ``q['question']`` and run a kNN search on 'question_vector' limited to ``q['course']``."""
    question_text = q['question']
    course_name = q['course']
    return elastic_search_knn(
        'question_vector',
        embedding_model.encode(question_text),
        course_name,
    )

# Evaluate the Elasticsearch-based search engine on the ground-truth queries
# and report both metrics.
evaluation_results_elastic = evaluate(ground_truth, question_vector_search_elastic)
print(f"Hit rate for Elasticsearch: {evaluation_results_elastic['hit_rate']}")
print(f"MRR for Elasticsearch: {evaluation_results_elastic['mrr']}")

 28%|██▊       | 508/1830 [00:55<02:14,  9.80it/s]