# Q1

In [1]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [2]:
user_question = "I just discovered the course. Can I still join it?"

In [3]:
embedding_model.encode(user_question)[0]

0.07822265

# Q2

In [4]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [5]:
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

In [6]:
len(documents)

948

In [7]:
#"machine-learning-zoomcamp"
ml_documents = [d for d in documents if d['course'] == 'machine-learning-zoomcamp']

In [8]:
len(ml_documents)

375

In [9]:
import numpy as np

def get_embeddings(corpus):
    embeddings = []
    
    for d in corpus:
        question = d['question']
        text = d['text']
        qa_text = f'{question} {text}'
        embedding = embedding_model.encode(qa_text).tolist()
        embeddings.append(embedding)
        d['question_vector'] = embedding_model.encode(question)
        d['text_vector'] = embedding_model.encode(text)
        d["qa_text_vector"] = embedding

    return corpus, embeddings

In [10]:
ml_documents, embeddings = get_embeddings(ml_documents)

In [11]:
X = np.array(embeddings)

In [12]:
X.shape

(375, 768)

# Q3

In [13]:
v = embedding_model.encode(user_question)
v.shape

(768,)

In [14]:
scores = X.dot(v)

In [15]:
max(scores)

0.6506573743245914

# Q4

In [16]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=ml_documents, embeddings=X)
%time search_engine.search(v, num_results=1)

CPU times: user 696 μs, sys: 13 μs, total: 709 μs
Wall time: 479 μs


[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'question_vector': array([ 0.07717123, -0.04749377,  0.02866317, -0.01141076,  0.08245005,
         -0.04042277, -0.02613374,  0.04122075, -0.04840769,  0.01509397,
         -0.00149666, -0.01334824,  0.0461828 ,  0.02318399,  0.04547324,
         -0.00809989,  0.0771832 , -0.03334848, -0.0418002 , -0.02304633,
         -0.01866887,  0.00298914, -0.00631757,  0.03931605, -0.02289939,
          0.07724462,  0.06296353,  0.037800

In [17]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        # idx = np.argsort(-scores)[:num_results]
        idx_partitioned = np.argpartition(-scores, num_results-1)[:num_results]
        idx = idx_partitioned[np.argsort(-scores[idx_partitioned])]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=ml_documents, embeddings=X)
%time search_engine.search(v, num_results=1)

CPU times: user 668 μs, sys: 13 μs, total: 681 μs
Wall time: 456 μs


[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'question_vector': array([ 0.07717123, -0.04749377,  0.02866317, -0.01141076,  0.08245005,
         -0.04042277, -0.02613374,  0.04122075, -0.04840769,  0.01509397,
         -0.00149666, -0.01334824,  0.0461828 ,  0.02318399,  0.04547324,
         -0.00809989,  0.0771832 , -0.03334848, -0.0418002 , -0.02304633,
         -0.01866887,  0.00298914, -0.00631757,  0.03931605, -0.02289939,
          0.07724462,  0.06296353,  0.037800

In [18]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [19]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [20]:
from tqdm import tqdm

In [21]:
# hitrate for minssearch (aka keyword search)
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [22]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q['question'])
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total)
    }

In [23]:
search_engine = VectorSearchEngine(documents=ml_documents, embeddings=X)
def vector_search(q):
    v_q = embedding_model.encode(q)
    return search_engine.search(v_q, num_results=5)

In [24]:
evaluate(ground_truth, vector_search)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:06<00:00, 27.58it/s]


{'hit_rate': 0.9398907103825137}

# Q5

In [25]:
import elasticsearch as es
es.__version__

(8, 4, 3)

In [26]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': '3c125f618e2a', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'zkKYa5BIRfGQzo4q0fxIwQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [27]:
dims = len(X[0])

In [28]:
user_question

'I just discovered the course. Can I still join it?'

In [29]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": dims,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": dims,
                "index": True,
                "similarity": "cosine"
            },
            "qa_text_vector": {
                "type": "dense_vector",
                "dims": dims,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions-ml"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(
    index=index_name, 
    settings=index_settings["settings"],
    mappings=index_settings["mappings"]
)

for doc in ml_documents:
    es_client.index(index=index_name, document=doc)

In [30]:
def elastic_search_knn(vector, num_results=5):
    knn = {
        "field": "qa_text_vector",
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000, 
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    index_name = "course-questions-ml"
    es_results = es_client.search(
        index=index_name,
        knn=search_query['knn'], 
        source=search_query['_source']
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs[:num_results]


def qa_vector_knn(q):
    v_q = embedding_model.encode(q)
    return elastic_search_knn(v_q, num_results=5)

In [31]:
res = qa_vector_knn(user_question)
res[0]['id']

'ee58a693'

# Q6

In [32]:
# hitrate for elasticsearch
evaluate(ground_truth, qa_vector_knn)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:38<00:00, 18.67it/s]


{'hit_rate': 0.9404371584699454}

validating results using mrr (expectation: elasticsearch to perform worse than all vector search, actual: it performed better)

In [33]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [34]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q['question'])
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit-rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }

In [35]:
evaluate(ground_truth, vector_search)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:07<00:00, 26.91it/s]


{'hit-rate': 0.9398907103825137, 'mrr': 0.8516484517304189}

In [36]:
evaluate(ground_truth, qa_vector_knn)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:35<00:00, 19.11it/s]


{'hit-rate': 0.9398907103825137, 'mrr': 0.8516484517304189}

unexecpted results. verify with all documents

In [37]:
documents, embeddings = get_embeddings(documents)
X = np.array(embeddings)
X.shape

(948, 768)

In [38]:
search_engine = VectorSearchEngine(documents=documents, embeddings=X)

def vector_search(q):
    v_q = embedding_model.encode(q)
    return search_engine.search(v_q, num_results=5)

In [39]:
# hitrate for vectorsearch w/o elasticsearch
evaluate(ground_truth, vector_search)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:07<00:00, 26.95it/s]


{'hit-rate': 0.9218579234972678, 'mrr': 0.8205191256830604}

In [42]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(
    index=index_name, 
    settings=index_settings["settings"],
    mappings=index_settings["mappings"]
)

for doc in ml_documents:
    es_client.index(index=index_name, document=doc)

def elastic_search_knn(vector, num_results=5):
    knn = {
        "field": "qa_text_vector",
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000, 
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    index_name = "course-questions"
    es_results = es_client.search(
        index=index_name,
        knn=search_query['knn'], 
        source=search_query['_source']
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [43]:
# hitrate for elasticsearch
evaluate(ground_truth, qa_vector_knn)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:38<00:00, 18.60it/s]


{'hit-rate': 0.9398907103825137, 'mrr': 0.8516484517304189}

vector search without elasticsearch results
```javascript
{'hit_rate': 0.9398907103825137} // ml documents run 1
{'hit-rate': 0.9398907103825137, 'mrr': 0.8516484517304189} // ml documents run 2
{'hit-rate': 0.9218579234972678, 'mrr': 0.8205191256830604} // all documents
```

vector search with elasticsearch results
```javascript
{'hit_rate': 0.9404371584699454} // ml documents run 1
{'hit-rate': 0.9398907103825137, 'mrr': 0.8516484517304189} // ml documents run 2
{'hit-rate': 0.9398907103825137, 'mrr': 0.8516484517304189} // all documents
```