In [4]:
# Load the documents that already have ids assigned

import json

# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the load independent of the platform's default locale encoding.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [5]:
# Indexing parameter configuration

from elasticsearch import Elasticsearch

# Client for the Elasticsearch node reachable at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Index definition: a single shard with no replicas, plus the field mappings.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Free-text fields are mapped as "text".
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # `course` is mapped as "keyword"; the search query uses it in a term filter.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"}, # Extra property: the document id
        }
    }
}

index_name = "course-questions"

# Delete the index first so that re-running this cell recreates it from scratch;
# ignore_unavailable=True keeps the delete from failing when the index is absent.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
# Index the documents, one request per document

from tqdm.auto import tqdm

# tqdm wraps the iterable to show a progress bar while indexing.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:04<00:00, 219.71it/s]


In [7]:
# Search query, we have added course field as a filter

def elastic_search(query, course):
    """Run a course-filtered full-text search against the Elasticsearch index.

    The query text is matched with a ``best_fields`` multi-match over the
    ``question`` (boost ``^3``), ``text`` and ``section`` fields, and the
    results are restricted by a term filter on ``course``. The request is
    sent through the notebook-level ``es_client`` to ``index_name``.

    Args:
        query: The question text to search for.
        course: Value the ``course`` field of a document must equal.

    Returns:
        A list with the ``_source`` of each hit (the request asks for 5 hits).
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields",
        }
    }
    course_filter = {"term": {"course": course}}

    request_body = {
        "size": 5,
        "query": {
            "bool": {
                "must": text_match,
                "filter": course_filter,
            }
        },
    }

    es_response = es_client.search(index=index_name, body=request_body)

    # Keep only the stored document of every hit, dropping the search metadata.
    return [hit['_source'] for hit in es_response['hits']['hits']]

In [8]:
# Example: search a sample question within the data-engineering-zoomcamp course
elastic_search(
    query="The course has already started, can I still enroll?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [9]:
# Let's read our ground truth data so that we can iterate over it
import pandas as pd

df_ground_truth = pd.read_csv('ground-truth-data.csv')

# Turn it into a list of dictionaries, one per row of the CSV
ground_truth = df_ground_truth.to_dict(orient='records')

In [10]:
# For every ground-truth record, run the Elasticsearch query and store one
# boolean per returned document: True when its id equals the expected document id.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 4675/4675 [00:20<00:00, 227.26it/s]


In [11]:
# Inspect the relevance flags: one list of booleans per ground-truth record
relevance_total

[[False, True, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, True, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [False, False, False, False, True],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, True, False],
 [True, False, False, False, False],
 [False, False, False, False,

In [12]:
# Calculate Hit Rate

def hit_rate(relevance_total):
    """Return the fraction of queries whose results contain the expected document.

    Args:
        relevance_total: One sequence of booleans per query; a ``True`` entry
            marks a returned document that matched the expected one.

    Returns:
        The share of queries with at least one ``True`` entry, as a float.
        Returns 0.0 when ``relevance_total`` is empty.
    """
    total = len(relevance_total)

    # An empty evaluation set has no hits; return 0.0 instead of dividing by zero.
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [13]:
# Calculate MRR

def mrr(relevance_total):
    """Compute the mean reciprocal rank (MRR) over all queries.

    For each query the score is ``1 / rank`` of the FIRST relevant result
    (ranks start at 1), or 0 when no result is relevant. The scores are then
    averaged over all queries.

    Args:
        relevance_total: One sequence of booleans per query, ordered by rank;
            a true entry marks a returned document that matched the expected one.

    Returns:
        The mean reciprocal rank as a float. Returns 0.0 when
        ``relevance_total`` is empty.
    """
    total = len(relevance_total)

    # An empty evaluation set scores 0.0 instead of dividing by zero.
    if total == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this, a query
                # with several true flags could score above 1.
                break

    return total_score / total

In [14]:
# Hit rate for the relevance flags computed so far
hit_rate(relevance_total)

0.7405347593582887

In [15]:
# MRR for the relevance flags computed so far
mrr(relevance_total)

0.5942816399286994

In [16]:
# Let's evaluate the minsearch text retrieval technique

import minsearch

# Same text fields as the Elasticsearch query; `course` and `id` are keyword fields.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

# Fit the index on the same documents that were loaded from disk.
index.fit(documents)

<minsearch.Index at 0x7673bf6344a0>

In [17]:
def minsearch_search(query, course):
    """Search the notebook-level minsearch ``index`` within a single course.

    Args:
        query: The question text to search for.
        course: Value passed as the ``course`` entry of the filter.

    Returns:
        Whatever ``index.search`` returns for the request (5 results are
        requested, with ``question`` boosted by 3.0 and ``section`` by 0.5).
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [18]:
# Same relevance computation as before, this time using minsearch.
# NOTE: this reassigns `relevance_total`, replacing the previously computed flags.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 4675/4675 [00:14<00:00, 317.67it/s]


In [19]:
# (hit rate, MRR) for the current relevance flags
hit_rate(relevance_total), mrr(relevance_total)

(0.7848128342245989, 0.6680427807486635)

In [20]:
# MinSearch seems to perform better than Elasticsearch in this case.
# Scores are (hit rate, MRR):
#
# MS - (0.7848128342245989, 0.6680427807486635)
# ES - (0.7405347593582887, 0.5942816399286994)

In [21]:
# Putting everything together

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each record's ``'document'`` entry
            holds the id of the document the search is expected to return.
        search_function: Callable that takes one ground-truth record and
            returns the retrieved documents, each exposing an ``'id'`` entry.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_flags(record):
        # Read the expected id before searching, then flag every result.
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [22]:
# Evaluate Elasticsearch: the lambda adapts a ground-truth record to (query, course)
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 4675/4675 [00:12<00:00, 381.90it/s]


{'hit_rate': 0.7405347593582887, 'mrr': 0.5942816399286994}

In [23]:
# Evaluate minsearch: the lambda adapts a ground-truth record to (query, course)
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 4675/4675 [00:14<00:00, 314.44it/s]


{'hit_rate': 0.7848128342245989, 'mrr': 0.6680427807486635}