In [1]:
import json
import hashlib
import dotenv
from elasticsearch import Elasticsearch

from openai import OpenAI
from collections import defaultdict
from tqdm.auto import tqdm

import pickle
import csv
import pandas as pd

import sys
sys.path.append("/workspaces/llm-zoomcamp/01-Intro")
import minsearch

## Step 1: Prepare the documents

In [2]:
with open("documents.json", 'rt') as f_in:
    docs_raw = json.load(f_in)

documents = []
for course in docs_raw:
    for doc in course['documents']:
        doc['course'] = course['course']
        documents.append(doc)

In [3]:
def generate_document_id(doc):

    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]

    return document_id

In [4]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [5]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [6]:
prompt_template = """
You emulate a student taking our course.
Formulate 5 questions this student might ask based on the FAQ record provided. The record should contain the answer to the questions, and the
questions should be complete and not too short. If possible, use as few words as possible from the record.

The record:
section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3", "question4", "question5"]

""".strip()

In [7]:
dotenv.load_dotenv('../.env')

True

In [8]:
ai_client = OpenAI()

In [9]:
def generate_questions(doc, prompt_template):
    
    prompt = prompt_template.format(**doc)
    
    resp = ai_client.chat.completions.create(
        model='gpt-4o',
        messages=[{'role': 'user', 'content': prompt}],
    )
    json_response = json.loads(resp.choices[0].message.content)
    return json_response

In [10]:
try:
    with open('results.pkl', 'rb') as f_in:
        results = pickle.load(f_in)
except FileNotFoundError:
    results = {}

In [11]:
ids = defaultdict(list)
for doc in documents:
    ids[doc['id']].append(doc)

In [12]:
[k for k, v in ids.items() if len(v) > 1]

['593f7569']

In [13]:
for doc in tqdm(documents):
    id = doc['id']
    if id in results:
        continue
    try:
        questions = doc['questions'] = generate_questions(doc, prompt_template)
        results[id] = questions
    except Exception as e:
        with open('results.pkl', 'wb') as f_out:
            pickle.dump(results, f_out)
        continue

with open('results.pkl', 'wb') as f_out:
    pickle.dump(results, f_out)

  0%|          | 0/948 [00:00<?, ?it/s]

In [14]:
with open('results.pkl', 'wb') as f_out:
    pickle.dump(results, f_out)

In [15]:
doc_index = {doc['id']: doc for doc in documents}

In [16]:
final_results = []
for id, questions in results.items():
    doc = doc_index[id]
    for question in questions:
        final_results.append((question, doc['course'], id))



In [17]:
with open('ground-truth-data.csv', 'wt') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['question', 'course', 'document_id'])
    writer.writerows(final_results)

In [23]:
es_client = Elasticsearch("http://localhost:9200")
es_client.info()

ObjectApiResponse({'name': '4319a1f1bc03', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'l15JZvkuTYyTEdKVpDD_gw', 'version': {'number': '8.17.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'a091390de485bd4b127884f7e565c0cad59b10d2', 'build_date': '2025-02-28T10:07:26.089129809Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [24]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text"
            },
            "section": {
                "type": "text"
            },
            "question": {
                "type": "text"
            },
            "course": {
                "type": "keyword"
            },
            "id": {
                "type": "keyword"
            },
        }
    }
}

index_name = "course-questions"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

for doc in documents:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [25]:
def es_query(query: str, course: str):

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["text", "section", "question^3"],
                        "type": "best_fields"
                    }
                },
                    "filter": {
                        "term": {
                            "course": course
                            }
                        }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    results = []
    for hit in response['hits']['hits']:
        results.append(hit['_source'])
    return results

In [32]:
index = minsearch.Index(
    text_fields=["text", "section", "question"],
    keyword_fields=["course", "id"]
)
index.fit(documents)

<minsearch.Index at 0x79c8577492e0>

In [34]:
def minsearch_search(q):
    boost = {'question':3, 'section':0.5}
    results = index.search(
        query=q['question'],
        filter_dict={'course': q['course']},
        boost_dict=boost,
        num_results=5
    )

    return results

In [41]:
es_query.__name__

'es_query'

In [26]:
df_ground_truth = pd.read_csv('./ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [42]:
def relevance_logging(ground_truth: dict, search_method, ) -> list:
    relevance_tracking = []

    for query in tqdm(ground_truth):
        id = query['document_id']
        if search_method.__name__ == 'es_query':
            resp = search_method(query=query['question'], course=query['course'])
        else:
            resp = search_method(query)
        # print(resp)
        result_relevance = [d['id'] == id for d in resp]
        relevance_tracking.append(result_relevance)

    return relevance_tracking


def hit_rate(relevance_tracking: dict) -> float: 

    hits = sum(map(lambda x: True in x, relevance_tracking))
    return hits/len(relevance_tracking)

def mrr(relevance_tracking: dict) -> float:

    n = len(relevance_tracking)
    relevance_total = 0

    for r in relevance_tracking:
        try:
            index = r.index(True) + 1
            relevance_total += (1/index)
        except ValueError:
            continue
        
    return relevance_total / n

            




In [28]:
relevance = relevance_logging(ground_truth=ground_truth)

  0%|          | 0/4655 [00:00<?, ?it/s]

In [44]:
relevance_logging(ground_truth=ground_truth, search_method=es_query)

  0%|          | 0/4655 [00:00<?, ?it/s]

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [False, False, False, True, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, True],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, 

In [29]:
hr = hit_rate(relevance)
print(hr)
mrr_metric = mrr(relevance)
print(mrr_metric)

0.7525241675617615
0.6125671321160048


[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'cours