In [29]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint used for every embedding in this notebook.
model_name = "multi-qa-distilbert-cos-v1"

# Instantiate the encoder once; later cells reuse `embedding_model`.
embedding_model = SentenceTransformer(model_name)


In [30]:
# Sample user question used for the search questions below.
user_query = "I just discovered the course. Can I still join it?"

# `v` is the embedding of the sample question.
v = embedding_model.encode(user_query)

## Question 1. Getting the embeddings model.
First value:

In [31]:
# First component of the query embedding. The embedding is stored in `v`
# (see the previous cell); `question_embedding` is never defined in this
# notebook, so referencing it fails on a fresh kernel.
v[0]

0.07822266

In [32]:
import requests 

# Download the FAQ documents (with ids) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# as a confusing JSON decoding failure on the next line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

In [33]:
# Keep only the documents that belong to the machine-learning-zoomcamp course.
filtered_documents = [d for d in documents if d['course'] == 'machine-learning-zoomcamp']

# A bare `len(...)` in the middle of a cell is discarded (only the last
# expression is displayed), so check the size explicitly instead. This also
# turns an empty filter result into a clear message rather than an IndexError.
assert len(filtered_documents) > 0, "no documents found for 'machine-learning-zoomcamp'"
filtered_documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

In [65]:
from tqdm import tqdm
import numpy as np

# One embedding per document, computed from "<question> <text>".
embeddings = [
    embedding_model.encode(f"{doc['question']} {doc['text']}")
    for doc in tqdm(filtered_documents)
]

# Stack the per-document vectors into a single 2-D array.
X = np.array(embeddings)


100%|██████████| 375/375 [01:52<00:00,  3.35it/s]


## Question 2. Creating the embeddings.
What's the shape of X?

In [35]:
# (number of documents, embedding length)
X.shape

(375, 768)

In [36]:
# Dot product of every document embedding with the query embedding `v`:
# one score per document.
scores = X.dot(v)

## Question 3: Search.
What's the highest score in the results?

In [37]:
# Largest score among all documents for the sample query.
max(scores)

0.6506574

In [None]:
class VectorSearchEngine:
    """Exhaustive dot-product search over a matrix of document embeddings.

    Row ``i`` of ``embeddings`` is the vector for ``documents[i]``.
    """

    def __init__(self, documents, embeddings):
        """Store the documents together with their embedding matrix.

        Args:
            documents: sequence of documents, indexable by integer position.
            embeddings: array-like with one row per document. It is converted
                with ``np.asarray`` so a plain list of vectors works too
                (an existing ndarray is kept as-is, without copying).

        Raises:
            ValueError: if the number of documents and embedding rows differ.
        """
        embeddings = np.asarray(embeddings)
        if len(documents) != len(embeddings):
            # Fail early: a mismatch would otherwise surface later as an
            # IndexError in search() or as silently unreachable documents.
            raise ValueError(
                f"got {len(documents)} documents but {len(embeddings)} embeddings"
            )
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the ``num_results`` documents with the highest dot-product score.

        Args:
            v_query: query vector with the same length as an embedding row.
            num_results: maximum number of documents to return.

        Returns:
            List of documents ordered from highest to lowest score.
        """
        scores = self.embeddings.dot(v_query)
        # argsort is ascending, so negate the scores to rank best-first.
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

# Build the in-memory engine over the filtered documents and their embeddings,
# then show the top 5 documents for the sample query embedding `v`.
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
search_engine.search(v, num_results=5)

In [40]:
import pandas as pd

# Location of the ground-truth CSV in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Load everything, then keep only the machine-learning-zoomcamp rows.
df_ground_truth = pd.read_csv(ground_truth_url)
is_ml_zoomcamp = df_ground_truth['course'] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth[is_ml_zoomcamp]

# One dict per row, which is the shape the evaluation loop iterates over.
ground_truth = df_ground_truth.to_dict('records')

In [42]:
# Peek at one record: a question, its course, and the id of the expected document.
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [41]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list of booleans per query; an entry is True
            when the result at that position is the expected document.

    Returns:
        Number of queries with at least one True, divided by the number of
        queries. Returns 0.0 for an empty input instead of raising
        ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [63]:
def evaluate(ground_truth, search_function, num_results=5):
    """Compute the hit rate of ``search_function`` over the ground-truth questions.

    Args:
        ground_truth: iterable of dicts with a 'question' text and the id of
            the expected document under 'document'.
        search_function: callable invoked as
            ``search_function(query_vector, num_results=...)`` that returns a
            list of documents, each carrying an 'id'.
        num_results: how many results to request per question (default 5,
            the value that was previously hard-coded).

    Returns:
        ``{'hit_rate': <float>}``.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        v_q = embedding_model.encode(q['question'])
        results = search_function(v_q, num_results=num_results)
        # One boolean per returned document: is it the expected one?
        relevance_total.append([d['id'] == doc_id for d in results])

    return {
        'hit_rate': hit_rate(relevance_total),
    }

## Question 4. Hit-rate for our search engine.
What did you get?

In [64]:
# Build the in-memory engine and measure its hit rate on the ground-truth questions.
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
evaluate(ground_truth, search_engine.search)

100%|██████████| 1830/1830 [02:35<00:00, 11.80it/s]


{'hit_rate': 0.9398907103825137}

In [72]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

# Field mappings, grouped by type. Merge order keeps the original key order.
text_fields = {name: {"type": "text"} for name in ("text", "section", "question")}
keyword_fields = {name: {"type": "keyword"} for name in ("course", "id")}
vector_field = {
    "type": "dense_vector",
    "dims": 768,
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    # One shard, zero replicas.
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **text_fields,
            **keyword_fields,
            "question_text_vector": vector_field,
        }
    },
}

index_name = "course-questions"

# Drop any existing index of the same name so the cell can be re-run, then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [73]:
# Store each document's embedding on the document itself as a plain list.
# Note: this mutates the dicts in `filtered_documents` in place.
for doc, embedding in zip(filtered_documents, X):
    doc.update(question_text_vector=embedding.tolist())


In [75]:
# Index the documents (including their vectors) one by one.
for doc in tqdm(filtered_documents):
    es_client.index(index=index_name, document=doc)

# Force a refresh so every document indexed above is visible to the searches
# that follow, even when the notebook is executed with "Run All" and the next
# query arrives right after the last index call. The trailing semicolon
# suppresses the response so the cell output stays just the progress bar.
es_client.indices.refresh(index=index_name);

100%|██████████| 375/375 [00:07<00:00, 48.01it/s]


In [76]:
# Same sample question as at the top of the notebook, re-encoded for the Elasticsearch query.
user_query = "I just discovered the course. Can I still join it?"
v = embedding_model.encode(user_query)

In [90]:
def elastic_search_knn(vector, num_results=5):
    """Run a kNN query against the `question_text_vector` field in Elasticsearch.

    The signature mirrors ``VectorSearchEngine.search`` so both backends can
    be passed to the same evaluation code.

    Args:
        vector: query embedding sent as ``query_vector``.
        num_results: number of nearest neighbours to return (default 5, the
            value that was previously hard-coded as ``k``).

    Returns:
        List with the ``_source`` of each hit, in the order Elasticsearch
        returned them; the vector field is excluded from ``_source``.
    """
    knn = {
        "field": "question_text_vector",
        "query_vector": vector,
        "k": num_results,
        "num_candidates": 10000,
    }

    search_query = {
        "knn": knn,
        # Request exactly `num_results` hits; the search API's default page
        # size would otherwise cap the response when num_results is larger.
        "size": num_results,
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

## Question 5: Indexing with Elasticsearch.
What's the ID of the document with the highest score?

In [91]:
# `id` of the first hit returned for the sample query embedding `v`.
elastic_search_knn(v)[0]['id']

'ee58a693'

In [92]:
def evaluate(ground_truth, search_function, **search_kwargs):
    """Compute the hit rate of ``search_function`` over the ground-truth questions.

    NOTE: this cell rebinds the name ``evaluate``. The earlier definition in
    this notebook (which always passed ``num_results=5``) is shadowed from
    here on. Extra keyword arguments are forwarded to ``search_function`` so
    this one definition can also drive a backend that needs them, e.g.
    ``evaluate(ground_truth, search_engine.search, num_results=5)``.

    Args:
        ground_truth: iterable of dicts with a 'question' text and the id of
            the expected document under 'document'.
        search_function: callable invoked as
            ``search_function(query_vector, **search_kwargs)`` that returns a
            list of documents, each carrying an 'id'.
        **search_kwargs: optional keyword arguments passed through unchanged
            to ``search_function``; none are sent when omitted.

    Returns:
        ``{'hit_rate': <float>}``.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        v_q = embedding_model.encode(q['question'])
        results = search_function(v_q, **search_kwargs)
        # One boolean per returned document: is it the expected one?
        relevance_total.append([d['id'] == doc_id for d in results])

    return {
        'hit_rate': hit_rate(relevance_total),
    }

## Question 6: Hit-rate for Elasticsearch
What's the hit-rate for our dataset when searching with Elasticsearch?

In [93]:
# Hit rate of the Elasticsearch kNN search on the same ground-truth questions.
evaluate(ground_truth, elastic_search_knn)

100%|██████████| 1830/1830 [02:48<00:00, 10.85it/s]


{'hit_rate': 0.9398907103825137}