In [5]:
import json
import hashlib
import dotenv
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

from openai import OpenAI
from collections import defaultdict
from tqdm.auto import tqdm

import pickle
import csv
import pandas as pd

import sys
sys.path.append("../01-Intro")
import minsearch

## Step 1: Prepare the documents

In [6]:
# Read the raw FAQ dump; it is consumed below as a list of courses, each with a
# 'course' name and a 'documents' list.
with open("documents.json", 'rt') as f_in:
    docs_raw = json.load(f_in)

# Flatten into a single list, tagging each document dict (in place) with the
# name of the course it belongs to.
documents = []
for course in docs_raw:
    for doc in course['documents']:
        doc.update(course=course['course'])
        documents += [doc]

In [7]:
def generate_document_id(doc):
    """Return a short, deterministic identifier for an FAQ document.

    The id is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>". MD5 is used purely as
    a fingerprint here, not for security.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [8]:
# Attach a deterministic short id to every document (mutates the dicts in place).
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [9]:
# Persist the flattened documents, now carrying 'course' and 'id', as pretty-printed JSON.
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [10]:
# Prompt used to generate ground-truth questions for one FAQ record.
# The {section}, {question} and {text} placeholders are filled with str.format
# from a document dict; the model is asked to reply with a bare JSON list so the
# answer can be parsed with json.loads.
prompt_template = """
You emulate a student taking our course.
Formulate 5 questions this student might ask based on the FAQ record provided. The record should contain the answer to the questions, and the
questions should be complete and not too short. If possible, use as few words as possible from the record.

The record:
section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3", "question4", "question5"]

""".strip()

In [12]:
# Load environment variables from the .env file one directory up.
# NOTE(review): presumably this is where the OpenAI credentials come from — confirm.
dotenv.load_dotenv('../.env')

True

In [13]:
# OpenAI client built with no explicit arguments; used by generate_questions.
# NOTE(review): presumably it reads its API key from the environment — confirm.
ai_client = OpenAI()

In [14]:
def generate_questions(doc, prompt_template):
    """Ask the chat model for questions about ``doc`` and return the parsed reply.

    ``prompt_template`` is filled with ``str.format(**doc)`` and sent as a single
    user message to ``gpt-4o`` through the notebook-level ``ai_client``. The text
    of the first choice is decoded with ``json.loads``, which raises if the
    model did not return valid JSON.
    """
    user_message = {'role': 'user', 'content': prompt_template.format(**doc)}
    completion = ai_client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

In [15]:
# Resume from a previous run if a checkpoint file exists; otherwise start empty.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# a results.pkl that this notebook produced.
try:
    with open('results.pkl', 'rb') as f_in:
        results = pickle.load(f_in)
except FileNotFoundError:
    results = {}

In [16]:
# Group documents by generated id; any id whose list holds more than one
# document is a collision.
# NOTE(review): `ids` is not referenced again in the visible cells — presumably
# kept for manual inspection.
ids = defaultdict(list)
for doc in documents:
    ids[doc['id']].append(doc)

In [17]:
# Generate questions for every document whose id is not already in `results`,
# so an interrupted run can be resumed from the checkpoint file.
for doc in tqdm(documents):
    # Not named `id`: that would shadow the builtin for the rest of the notebook.
    doc_id = doc['id']
    if doc_id in results:
        continue
    try:
        questions = doc['questions'] = generate_questions(doc, prompt_template)
        results[doc_id] = questions
    except Exception as e:
        # Best-effort: report the failure instead of hiding it, checkpoint what
        # has been collected so far, then carry on with the next document.
        print(f"Failed to generate questions for document {doc_id}: {e!r}")
        with open('results.pkl', 'wb') as f_out:
            pickle.dump(results, f_out)

# Final checkpoint once the whole pass is done.
with open('results.pkl', 'wb') as f_out:
    pickle.dump(results, f_out)

100%|██████████| 948/948 [00:00<00:00, 2195582.66it/s]


In [18]:
# Write the accumulated `results` to the checkpoint file.
# NOTE(review): the generation cell already ends with this same dump, so this
# cell looks redundant — confirm before removing.
with open('results.pkl', 'wb') as f_out:
    pickle.dump(results, f_out)

In [19]:
# Lookup table from document id to document; when several documents share an
# id, the last one in `documents` wins.
doc_index = {doc['id']: doc for doc in documents}

In [20]:
# Flatten `results` into (question, course, document_id) rows for the CSV.
# Written as a comprehension with `doc_id` so the builtin `id` is not rebound
# at notebook scope and no loop variables leak into later cells.
final_results = [
    (question, doc_index[doc_id]['course'], doc_id)
    for doc_id, questions in results.items()
    for question in questions
]



In [21]:
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate '\n' write '\r\r\n' row endings (blank
# rows on Windows). UTF-8 is set explicitly because the file is read back with
# pd.read_csv, which assumes UTF-8 rather than the locale encoding.
with open('ground-truth-data.csv', 'wt', newline='', encoding='utf-8') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['question', 'course', 'document_id'])
    writer.writerows(final_results)

In [22]:
# Client for the Elasticsearch node at localhost:9200. The trailing info() call
# is the cell's displayed value and doubles as a quick connectivity check.
es_client = Elasticsearch("http://localhost:9200")
es_client.info()

ObjectApiResponse({'name': 'ad69993b5bed', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'uxoTwmPhSw2EfiNXPOhI9Q', 'version': {'number': '8.17.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'a091390de485bd4b127884f7e565c0cad59b10d2', 'build_date': '2025-02-28T10:07:26.089129809Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [23]:
# Index definition for plain keyword search: three analysed text fields plus
# exact-match `course` and `id` keywords.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

# Drop any previous index of the same name so the cell can be re-run, then
# create it afresh.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

# Index documents one by one; a failure is printed and the loop carries on.
for doc in documents:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as exc:
        print(exc)

In [29]:
def es_query(query: str, course: str):
    """Keyword search in Elasticsearch, restricted to one course.

    Runs a ``best_fields`` multi-match of ``query`` over text, section and
    question (question boosted x3), filtered by an exact ``course`` term, against
    the notebook-level ``index_name`` using ``es_client``.

    Returns the ``_source`` of up to 5 hits, in the order Elasticsearch returned them.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["text", "section", "question^3"],
            "type": "best_fields",
        }
    }
    course_filter = {"term": {"course": course}}
    body = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }

    response = es_client.search(index=index_name, body=body)
    return [hit['_source'] for hit in response['hits']['hits']]

In [30]:
# In-memory minsearch index over the same documents, used below as a second
# search backend next to Elasticsearch. fit() is the last expression so its
# return value is shown as the cell output.
index = minsearch.Index(
    text_fields=["text", "section", "question"],
    keyword_fields=["course", "id"]
)
index.fit(documents)

<minsearch.Index at 0x31e32f620>

In [31]:
def minsearch_search(query, course):
    """Search the notebook-level minsearch ``index`` within a single course.

    The question field is boosted x3 and the section field x0.5; at most 5
    results are requested. Returns whatever ``index.search`` returns.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3, 'section': 0.5},
        num_results=5,
    )

In [32]:
# Reload the generated ground truth and turn it into a list of row dicts
# (one per question, keyed by the CSV column names).
df_ground_truth = pd.read_csv('./ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [33]:
def relevance_logging(ground_truth: list, search_method) -> list:
    """Run ``search_method`` on every ground-truth record and flag the relevant hits.

    Args:
        ground_truth: records (dicts) with 'question', 'course' and
            'document_id' keys.
        search_method: callable taking ``query=`` and ``course=`` keyword
            arguments and returning an iterable of result dicts that each
            carry an 'id' key.

    Returns:
        One list of booleans per record, in result order; an entry is True
        when that result's 'id' equals the record's 'document_id'.
    """
    relevance_tracking = []

    for record in tqdm(ground_truth):
        expected_id = record['document_id']
        hits = search_method(query=record['question'], course=record['course'])
        relevance_tracking.append([hit['id'] == expected_id for hit in hits])

    return relevance_tracking


def hit_rate(relevance_tracking: list) -> float:
    """Fraction of queries whose result list contains the relevant document.

    Args:
        relevance_tracking: one list of booleans per query, as produced by
            ``relevance_logging``.

    Returns:
        Number of queries with at least one True, divided by the number of
        queries; 0.0 when there are no queries at all (instead of raising
        ZeroDivisionError).
    """
    if not relevance_tracking:
        return 0.0

    hits = sum(1 for flags in relevance_tracking if True in flags)
    return hits / len(relevance_tracking)

def mrr(relevance_tracking: list) -> float:
    """Mean reciprocal rank of the relevant document across queries.

    Args:
        relevance_tracking: one list of booleans per query, as produced by
            ``relevance_logging``.

    Returns:
        The average over all queries of 1 / (1-based position of the first
        True); a query with no True contributes 0. Returns 0.0 when there are
        no queries at all (instead of raising ZeroDivisionError).
    """
    if not relevance_tracking:
        return 0.0

    reciprocal_rank_sum = 0.0
    for flags in relevance_tracking:
        if True in flags:
            # list.index gives a 0-based position; ranks are 1-based.
            reciprocal_rank_sum += 1 / (flags.index(True) + 1)

    return reciprocal_rank_sum / len(relevance_tracking)

            




In [34]:
# Evaluate both keyword-search backends on the same ground-truth questions.
relevance_results_minsearch = relevance_logging(ground_truth=ground_truth, search_method=minsearch_search)
relevance_results_elastic_search = relevance_logging(ground_truth=ground_truth, search_method=es_query)

100%|██████████| 4655/4655 [00:08<00:00, 568.04it/s]
100%|██████████| 4655/4655 [00:18<00:00, 256.21it/s]


In [35]:
# (hit rate, MRR) for each backend: Elasticsearch first, then minsearch.
metrics_es = tuple(metric(relevance_results_elastic_search) for metric in (hit_rate, mrr))
metrics_min = tuple(metric(relevance_results_minsearch) for metric in (hit_rate, mrr))

print(metrics_es, metrics_min, sep='\n')

(0.7523093447905478, 0.612631578947369)
(0.8021482277121375, 0.6812853562477629)


In [36]:
# Sentence-transformers model used to embed questions and answers for the
# vector-search part of the notebook.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [37]:
# Sanity check: print the embedding length for a sample sentence. The
# dense_vector fields of the index mapping are declared with dims 384.
encoding = model.encode("Is it worth bothering?")
print(len(encoding))

384


In [38]:
# Recreate the index with dense_vector fields for kNN search. `dims` is 384 to
# match the embedding length of the sentence-transformers model used here.
# `id` is mapped as a keyword, as in the keyword-search index: documents still
# carry it and the kNN query requests it via `_source`, so it should not be left
# to dynamic mapping.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding of the question alone.
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            # Embedding of the answer text alone.
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            # Embedding of question and answer text joined by a space.
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

index_name = "course-questions"
# Drop the previous index of the same name so the cell can be re-run; the
# create call stays last so its response is shown as the cell output.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [39]:
# Embed each document three ways (question, answer text, and the two joined by
# a space) and index it. The vectors are stored on the document dict itself, so
# the entries of `documents` are mutated in place. A failure on one document is
# printed and the loop carries on.
for doc in tqdm(documents):
    try:
        doc['question_vector'] = model.encode(doc['question'])
        doc['text_vector'] = model.encode(doc['text'])
        doc['question_text_vector'] = model.encode(doc['question'] + ' ' + doc['text'])
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

100%|██████████| 948/948 [01:32<00:00, 10.23it/s]


In [40]:
# Embedding of a sample user question, used to try out the kNN search.
vector_search_term = model.encode("Can I enroll after the course has started?")

In [41]:
def elastic_search_knn(field, vector, course):
    """kNN vector search in Elasticsearch, restricted to one course.

    Args:
        field: name of the dense_vector field to search
            (e.g. 'question_vector').
        vector: query embedding to compare against ``field``.
        course: only documents whose ``course`` keyword equals this value are
            considered.

    Returns:
        The ``_source`` (text, section, question, course, id) of up to 5
        nearest hits, in the order Elasticsearch returned them.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        # Filter on the caller's course; this used to be hard-coded to
        # "data-engineering-zoomcamp", which silently ignored the argument.
        "filter": {"term": {"course": course}},
    }
    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in es_results['hits']['hits']]
                     

In [42]:
# Try the kNN search on the question embeddings with the sample query vector.
elastic_search_knn('question_vector', vector_search_term, 'data-engineering-zoomcamp')

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a482086d'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud ac