In [6]:
import json

# Load the document collection. Decode explicitly as UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [7]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node listening on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

index_name = "interview-questions"

# One shard, no replicas. The three free-text fields use the "text" type,
# while position and id use the "keyword" type.
index_settings = dict(
    settings=dict(number_of_shards=1, number_of_replicas=0),
    mappings=dict(
        properties={
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            **{field: {"type": "keyword"} for field in ("position", "id")},
        }
    ),
)

# Drop any previous copy of the index (tolerating its absence) so that
# re-running this cell always starts from an empty index, then create it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'interview-questions'})

In [8]:
from tqdm.auto import tqdm

# Index the documents one request at a time, showing a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

# Force a refresh so every document is searchable before the next cell runs;
# otherwise a "Run All" may query the index before the periodic refresh and
# silently miss the most recently indexed documents.
# The trailing semicolon suppresses the response repr in the cell output.
es_client.indices.refresh(index=index_name);

100%|██████████| 145/145 [00:02<00:00, 53.09it/s]


In [9]:
def elastic_search(query, position):
    """Return the `_source` of the top 5 Elasticsearch hits for `query`.

    The query text is matched against the question, text and section
    fields (question boosted by 3), and only documents whose `position`
    equals the given value are considered.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    position_filter = {"term": {"position": position}}

    search_query = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": position_filter}}
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents, dropping the hit metadata.
    return [hit['_source'] for hit in response['hits']['hits']]

In [10]:
# Sanity check: run one sample query restricted to position "de".
elastic_search(
    query="What is Data Engineering?",
    position="de"
)

[{'id': 'de0083',
  'question': 'What is data engineering?',
  'text': 'Data engineering is the practice of designing, building, and maintaining systems for collecting, storing, and analyzing large volumes of data. It involves creating data pipelines, optimizing data storage, and ensuring data quality and accessibility for data scientists and analysts.',
  'position': 'de',
  'section': 'Basic Data Engineering Interview Questions'},
 {'id': 'de0001',
  'question': 'What is Data Engineering (for you)?',
  'text': 'This may seem like a pretty basic data engineer interview questions, but regardless of your skill level, this may come up during your interview. Your interviewer wants to see what your specific definition of data engineering is, which also makes it clear that you know what the work entails.  So, what is it? In a nutshell, it is the act of transforming, cleansing, profiling, and aggregating large data sets. You can also take it a step further and discuss the daily duties of a d

In [11]:
import pandas as pd

In [12]:
# Ground-truth queries; later cells read the 'question', 'position' and
# 'document' values of each row.
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [13]:
# Convert to a list with one dict per row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [14]:
# For every ground-truth query, store one boolean per Elasticsearch result:
# True where the returned document is the one the query was written for.
relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record['document']
    hits = elastic_search(query=record['question'], position=record['position'])
    relevance_total.append([hit['id'] == expected_id for hit in hits])

100%|██████████| 726/726 [00:02<00:00, 311.72it/s]


In [15]:
def hit_rate(relevance_total):
    """Fraction of queries with at least one relevant result.

    Args:
        relevance_total: one sequence of relevance flags per query, with
            one flag per returned result (truthy = relevant).

    Returns:
        The share of queries whose flags contain at least one relevant
        result, as a float in [0, 1]. Returns 0.0 when there are no
        queries instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

In [16]:
def mrr(relevance_total):
    """Mean Reciprocal Rank of the first relevant result per query.

    Args:
        relevance_total: one sequence of relevance flags per query, ordered
            by rank (index 0 is the top result; truthy = relevant).

    Returns:
        The average over all queries of 1 / rank of the first relevant
        result, where a query with no relevant result contributes 0.
        Returns 0.0 when there are no queries instead of raising
        ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this a line
                # with several hits would push a query's score above 1.
                break

    return total_score / len(relevance_total)

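- hit-rate (recall): share of queries for which the expected document appears among the returned results
- Mean Reciprocal Rank (mrr): average over all queries of 1 / rank of the expected document (0 when it is not returned)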

In [17]:
# Hit rate and MRR for the relevance flags currently held in `relevance_total`.
hit_rate(relevance_total), mrr(relevance_total)

(0.5922865013774105, 0.41999540863177215)

In [18]:
import minsearch

# Build a minsearch index over the same documents: question/text/section are
# registered as text fields, position/id as keyword fields.
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["position", "id"])

# Kept as the final expression so the cell still displays the fitted index.
index.fit(documents)

<minsearch.Index at 0x73a3e369cce0>

In [22]:
def minsearch_search(query, position):
    """Return the top 5 minsearch results for `query` within one `position`.

    The 'question' field is boosted by 3.0 and the 'section' field by 0.5;
    results are filtered to documents whose 'position' equals `position`.
    """
    return index.search(
        query=query,
        filter_dict={'position': position},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [23]:
# For every ground-truth query, store one boolean per minsearch result:
# True where the returned document is the one the query was written for.
relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record['document']
    hits = minsearch_search(query=record['question'], position=record['position'])
    relevance_total.append([hit['id'] == expected_id for hit in hits])

100%|██████████| 726/726 [00:01<00:00, 380.29it/s]


In [40]:
# Hit rate and MRR for the relevance flags currently held in `relevance_total`.
# NOTE(review): re-run the cell that fills `relevance_total` first, so these
# numbers reflect the current search function rather than stale state.
hit_rate(relevance_total), mrr(relevance_total)

(0.7722066133563864, 0.661454506159499)

Compare with ES results:
```
(0.5922865013774105, 0.41999540863177215)
```

In [24]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    Args:
        ground_truth: iterable of query records; each record's 'document'
            entry is the id of the document that should be retrieved.
        search_function: callable that takes one record and returns the
            retrieved documents, each carrying an 'id' entry.

    Returns:
        A dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    def relevance_flags(q):
        # Read the expected id before searching, then flag each result.
        expected_id = q['document']
        return [d['id'] == expected_id for d in search_function(q)]

    relevance_total = [relevance_flags(q) for q in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [25]:
# Evaluate Elasticsearch: adapt each ground-truth record to elastic_search's (query, position) arguments.
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['position']))

100%|██████████| 726/726 [00:01<00:00, 450.85it/s]


{'hit_rate': 0.5922865013774105, 'mrr': 0.41999540863177215}

In [26]:
# Evaluate minsearch: adapt each ground-truth record to minsearch_search's (query, position) arguments.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['position']))

100%|██████████| 726/726 [00:01<00:00, 381.11it/s]


{'hit_rate': 0.6377410468319559, 'mrr': 0.5032369146005506}