In [1]:
!pip install sentence_transformers==2.7.0

Collecting sentence_transformers==2.7.0
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers==2.7.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers==2.7.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers==2.7.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers==2.7.0)
  Using cache

In [10]:
from sentence_transformers import SentenceTransformer

In [11]:
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [12]:
user_question = "I just discovered the course. Can I still join it?"

In [29]:
embedding_model.encode(user_question)[0]

0.07822261

In [17]:
import requests

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [20]:
filtered_docs = [entry for entry in documents if entry.get('course') == 'machine-learning-zoomcamp']

In [21]:
len(filtered_docs)

375

In [25]:
embeddings = []

for doc in filtered_docs:
    qa_text = f"{doc['question']} {doc['text']}"
    embedding = embedding_model.encode(qa_text).tolist()
    embeddings.append(embedding)

In [27]:
import numpy as np
X = np.array(embeddings)

In [28]:
X.shape

(375, 768)

In [52]:
v = np.array(embedding_model.encode(user_question))

In [31]:
scores = X.dot(v)

In [32]:
max(scores)

0.6506574183148458

In [46]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=filtered_docs, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

In [35]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [36]:
search_engine = VectorSearchEngine(documents=ground_truth, embeddings=X)
search_engine.search(v, num_results=5)

[{'question': 'Can I still interact with instructors after missing a session?',
  'course': 'machine-learning-zoomcamp',
  'document': '5170565b'},
 {'question': 'Is the focus of the course more on practice or theory?',
  'course': 'machine-learning-zoomcamp',
  'document': 'ecca790c'},
 {'question': 'Are the course videos live or pre-recorded?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'What does a low standard deviation indicate about the values?',
  'course': 'machine-learning-zoomcamp',
  'document': '266faa6d'},
 {'question': 'Is prior knowledge in math necessary for this course?',
  'course': 'machine-learning-zoomcamp',
  'document': 'c25b3de4'}]

In [37]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [61]:
from tqdm.auto import tqdm
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q, 5)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
    }

In [62]:
search_engine_gt = VectorSearchEngine(documents=ground_truth, embeddings=X)

In [63]:
X.dtype

dtype('float64')

In [64]:
def vector_search_function(q, num_results):
    query = q['question']
    v_query = embedding_model.encode(query)
    return search_engine.search(v_query, num_results=num_results)

In [65]:
results = evaluate(ground_truth, vector_search_function)

  0%|          | 0/1830 [00:00<?, ?it/s]

In [66]:
results

{'hit_rate': 0.9398907103825137}

In [None]:
operations = []
for doc in documents:
    # Transforming the title into an embedding using the model
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

In [None]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.14.0-py3-none-any.whl (480 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.2/480.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting elastic-transport<9,>=8.13 (from elasticsearch)
  Downloading elastic_transport-8.13.1-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.5/64.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.13.1 elasticsearch-8.14.0


In [None]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('https://rwuqqkpsmyzv.share.zrok.io', max_retries=10, retry_on_timeout=True)

es_client.info()

ObjectApiResponse({'name': '6ae89ee02e9f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'iuosmAPNTqWwEaQZX9cpWw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [None]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

In [None]:
index_name = "course-questions"

for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

AuthorizationException(403, '<html>\r\n<head><title>403 Forbidden</title></head>\r\n<body>\r\n<center><h1>403 Forbidden</h1></center>\r\n</body>\r\n</html>\r\n')
AuthorizationException(403, '<html>\r\n<head><title>403 Forbidden</title></head>\r\n<body>\r\n<center><h1>403 Forbidden</h1></center>\r\n</body>\r\n</html>\r\n')
AuthorizationException(403, '<html>\r\n<head><title>403 Forbidden</title></head>\r\n<body>\r\n<center><h1>403 Forbidden</h1></center>\r\n</body>\r\n</html>\r\n')
AuthorizationException(403, '<html>\r\n<head><title>403 Forbidden</title></head>\r\n<body>\r\n<center><h1>403 Forbidden</h1></center>\r\n</body>\r\n</html>\r\n')
AuthorizationException(403, '<html>\r\n<head><title>403 Forbidden</title></head>\r\n<body>\r\n<center><h1>403 Forbidden</h1></center>\r\n</body>\r\n</html>\r\n')
