In [1]:
import json

# JSON is UTF-8 by specification; state it explicitly instead of relying on
# the platform's default text encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [2]:
# Flatten the per-course structure into one list of FAQ entries, tagging each
# entry (in place) with the course it belongs to.
documents = []

for course_entry in docs_raw:
    course_faqs = course_entry['documents']
    for faq in course_faqs:
        faq['course'] = course_entry['course']
    documents += course_faqs

documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
pip install sentence_transformers==2.7.0

Note: you may need to restart the kernel to use updated packages.


In [4]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model by name.
model_name = "multi-qa-distilbert-cos-v1"
model = SentenceTransformer(model_name)

In [5]:
# Encode a sample question; `embed` is reused further down as the query vector.
user_question = "I just discovered the course. Can I still join it?"
embed = model.encode(user_question)
first_component = embed[0]
print(first_component)

0.078222655


In [6]:
# Dimensionality of the embedding. `embed` already holds the encoding of this
# exact question, so reuse it instead of running the model a second time.
len(embed)

768

In [7]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# Bound the wait so a stalled connection cannot hang the notebook.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
# NOTE: this rebinds `documents`, replacing the list built from documents.json above.
documents = docs_response.json()

In [8]:
# Keep only the entries that belong to the machine-learning course.
fil = list(filter(lambda entry: entry['course'] == 'machine-learning-zoomcamp', documents))

In [9]:
import numpy as np
# Build one "question + answer" string per document and encode them all in a
# single batched call instead of invoking the model once per document.
qa_texts = [f"{doc['question']} {doc['text']}" for doc in fil]
embeddings = model.encode(qa_texts)
# Attach each vector to its document as a plain Python list.
for doc, embedding in zip(fil, embeddings):
    doc['text_vector'] = embedding.tolist()
X = np.asarray(embeddings)
print(X.shape)

(375, 768)


In [10]:
# Dot product of every document vector with the sample question vector;
# report the best score.
scores = X @ embed
print(np.max(scores))

0.6506573


In [11]:

class VectorSearchEngine:
    """Brute-force dot-product search over an in-memory embedding matrix.

    Args:
        documents: Sequence of documents; ``documents[i]`` corresponds to
            row ``i`` of ``embeddings``.
        embeddings: 2-D array-like with one row per document. Nested lists
            are accepted and converted to a NumPy array.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        # Coerce once so plain nested lists work too; an existing ndarray is
        # passed through without copying.
        self.embeddings = np.asarray(embeddings)

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents, highest dot product first.

        Args:
            v_query: 1-D query vector (array-like) with the same length as
                an embedding row.
            num_results: Maximum number of documents to return.

        Returns:
            List of documents ordered by descending score.
        """
        scores = self.embeddings.dot(np.asarray(v_query))
        # Negate so argsort's ascending order yields the highest scores first.
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]
# Search engine over the machine-learning documents and their embedding matrix.
search_engine = VectorSearchEngine(fil, X)


In [12]:
import pandas as pd
# Download the evaluation set and keep only the machine-learning course rows.
ground_truth_url = f'{base_url}/03-vector-search/eval/ground-truth-data.csv?raw=1'
df_ground_truth = pd.read_csv(ground_truth_url)
is_ml_course = df_ground_truth['course'] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth.loc[is_ml_course]
ground_truth = df_ground_truth.to_dict(orient='records')
print("Sample ground truth records:", ground_truth[:3])


Sample ground truth records: [{'question': 'Where can I sign up for the course?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}, {'question': 'Can you provide a link to sign up?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}, {'question': 'Is there an FAQ for this Machine Learning course?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}]


In [13]:
# Hit rate of the vector search: the share of ground-truth questions whose
# expected document id appears among the top 5 results.
hits = 0
for record in ground_truth:
    question_vector = model.encode(record['question'])
    top_docs = search_engine.search(question_vector, num_results=5)
    for candidate in top_docs:
        if 'id' in candidate and candidate['id'] == record['document']:
            hits += 1
            break  # count each question at most once

hitrate = hits / len(ground_truth)
print(hitrate)


0.9398907103825137


In [14]:
pip install elasticsearch


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [15]:
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
es_client = Elasticsearch('http://localhost:9200')

# Define the index settings and mappings
index_name = "course-questions"
text_fields = ("section", "question", "text")  # full-text fields
properties = {field: {"type": "text"} for field in text_fields}
properties["course"] = {"type": "keyword"}  # exact-match field, used as a term filter below
properties["text_vector"] = {"type": "dense_vector", "dims": 768}  # Adjust dimensions as needed
index_settings = {"mappings": {"properties": properties}}

# Create the index if it doesn't exist
index_exists = es_client.indices.exists(index=index_name)
if not index_exists:
    es_client.indices.create(index=index_name, body=index_settings)

# Index the documents, using each document's own id as the Elasticsearch id
for record in documents:
    es_client.index(index=index_name, id=record['id'], body=record)


In [16]:
def elastic_search(query, index_name="course-questions",
                   course="machine-learning-zoomcamp", client=None):
    """Return the ``(_id, _score)`` of the best keyword match for ``query``.

    Args:
        query: Free-text question to search for.
        index_name: Name of the Elasticsearch index to query.
        course: Value the ``course`` field must equal (term filter).
        client: Object exposing ``search(index=..., body=...)``. Defaults to
            the notebook-level ``es_client``.

    Returns:
        A ``(document_id, score)`` tuple for the top hit, or ``(None, None)``
        when the search returns no hits.
    """
    if client is None:
        client = es_client

    search_query = {
        "size": 1,  # Get the top result
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # The question field is boosted 3x over the other fields.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = client.search(index=index_name, body=search_query)
    hits = response['hits']['hits']
    if not hits:
        # Nothing matched: report it explicitly rather than raising IndexError.
        return None, None
    top_result = hits[0]  # Get the top result
    return top_result['_id'], top_result['_score']  # Return the ID and score of the top document

# Run the keyword search for a sample question and show which document ranks first.
user_query = "I just discovered the course. Can I still join it?"
top_result_id, top_result_score = elastic_search(user_query)
print(f"ID of the document with the highest score: {top_result_id}")


ID of the document with the highest score: ee58a693


In [17]:
def elastic_search(query, index_name="course-questions",
                   course="machine-learning-zoomcamp", num_results=5, client=None):
    """Return the stored documents of the top keyword matches for ``query``.

    NOTE: this redefines ``elastic_search`` from the previous cell, which
    returned a single ``(_id, _score)`` tuple; from here on the name refers
    to this list-returning version.

    Args:
        query: Free-text question to search for.
        index_name: Name of the Elasticsearch index to query.
        course: Value the ``course`` field must equal (term filter).
        num_results: Maximum number of hits to request.
        client: Object exposing ``search(index=..., body=...)``. Defaults to
            the notebook-level ``es_client``.

    Returns:
        List with the ``_source`` of each hit, in the order returned by
        Elasticsearch; empty when nothing matches.
    """
    if client is None:
        client = es_client

    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # The question field is boosted 3x over the other fields.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]


In [18]:
import pandas as pd

# Load the ground truth dataset
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Keep only the machine-learning course rows and turn them into a list of dicts.
df_ground_truth = pd.read_csv(ground_truth_url)
ml_rows = df_ground_truth['course'] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth.loc[ml_rows]
ground_truth = df_ground_truth.to_dict(orient='records')

def calculate_hit_rate_es(records=None, search=None):
    """Compute the hit rate of the Elasticsearch keyword search.

    A question counts as a hit when the expected document id appears among
    the documents returned by the search function.

    Args:
        records: Iterable-with-length of dicts with ``'question'`` and
            ``'document'`` keys. Defaults to the notebook-level
            ``ground_truth``.
        search: Callable ``search(query, index_name=...)`` returning a list
            of document dicts. Defaults to the notebook-level
            ``elastic_search``.

    Returns:
        Fraction of records that were hits, or ``0.0`` when there are no
        records.
    """
    if records is None:
        records = ground_truth
    if search is None:
        search = elastic_search

    if not records:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    hits = 0
    for item in records:
        result_docs = search(item['question'], index_name="course-questions")
        # Tolerate results without an 'id' field, matching the check used by
        # the vector-search evaluation earlier in the notebook.
        if any('id' in result and result['id'] == item['document'] for result in result_docs):
            hits += 1

    return hits / len(records)

# Evaluate the Elasticsearch keyword search and report its hit rate to two decimals.
hitrate_es = calculate_hit_rate_es()
print(f"Hit-rate for Elasticsearch: {hitrate_es:.2f}")


Hit-rate for Elasticsearch: 0.75
