# Text search evaluation

Onse we get our ground truth data, containing all the queries with their corresponding expected document to be retrieved, we are going to evaluate wether for each query our search technique returns the expected document. Our algorithmshould do:

    for each q in ground truth dataset:
        execute q
        check if d in the results

For this particualar case we will measure using the Hit Rate and the MRR

In [2]:
import json

from tqdm import tqdm
from elasticsearch import Elasticsearch

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [3]:
es_client = Elasticsearch("http://localhost:9200")
es_client.info()

ObjectApiResponse({'name': 'eb9088a7b378', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'OAMBaAMSQ36711wobrForQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
index_settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "text":{"type":"text"},
            "section":{"type":"text"},
            "question":{"type":"text"},
            "course":{"type":"keyword"},
            "id":{"type":"keyword"}
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:48<00:00, 19.57it/s]


In [7]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must":{
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter":{
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [8]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [9]:
import pandas as pd

In [10]:
df_ground_truth = pd.read_csv("groud-truth-data.csv")

In [11]:
df_ground_truth.head()

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
3,How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
4,Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef


In [13]:
ground_truth = df_ground_truth.to_dict(orient="records")

In [14]:
for q in ground_truth:
    doc_id = q['document']
    results = elastic_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    break

In [15]:
relevance

[True, False, False, False, False]

In [16]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [03:53<00:00, 19.79it/s]


In [17]:
relevance_total

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [],
 [],
 [],
 [],
 [],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [True

In [18]:
example = [
    [True, False, False, False, False], # 1
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [False, False, True, False, False], # 1
    [False, False, False, False, False] # 0
]

* hit-rate (recall)
* Mean Reciprocal Rank (MMR)

In [19]:
# hit-rate (recall)
7/len(example)

0.5833333333333334

In [32]:
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt += 1

    return cnt/len(relevance_total) 

In [33]:
hit_rate(example)

0.5833333333333334

In [30]:
import functools
import operator
def hit_rate(relevance_total):
    cnt = functools.reduce(operator.add, [sum(row) for row in relevance_total])
    return cnt/len(relevance_total)

In [31]:
hit_rate(example)

0.5833333333333334

In [None]:
## MMR
example = [
    [True, False, False, False, False], # 1
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [False, False, True, False, False], # 0.33
    [False, False, False, False, False] # 0
]

# 1 => 1
# 2 => 1/2 = 0.5
# 3 => 1/3 = 0.33
# 4 => 1/4 = 0.25
# 5 => 1/5 = 0.2
# rank = 1/rank
# none = 0

In [47]:
def mmr(relevance_total):
    total_score = 0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score += 1/ (rank + 1)

    return total_score/len(relevance_total) 

In [35]:
mmr(example)

0.5277777777777778

In [49]:
def mmr(relevance_total):
    total_score = functools.reduce(
        operator.add, 
        [
            sum(row) for row in [
                [int(element)/(i + 1) for i, element in enumerate(row)] 
                for row in relevance_total
            ]
        ]
    )
    return total_score / len(relevance_total)

In [45]:
mmr(example)

0.5277777777777778

In [50]:
hit_rate(relevance_total), mmr(relevance_total)

(0.7395720769397017, 0.6029788920106625)

In [51]:
import minsearch

In [52]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

In [53]:
index.fit(documents)

<minsearch.Index at 0x2095de15180>

In [55]:
def minsearch_search(query, course):
    boost = {"question":3, "section":0.5}

    results = index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict = boost,
        num_results = 5
    )

    return results

In [56]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)
    
hit_rate(relevance_total), mmr(relevance_total)

100%|██████████| 4627/4627 [00:19<00:00, 231.59it/s]


(0.7722066133563864, 0.661454506159499)

Compare with ES results:

(0.7395720769397017, 0.6029788920106625)

In [57]:
def evaluate(ground_truth, search_function):

    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        "hit_rate": hit_rate(relevance_total),
        "mmr": mmr(relevance_total)
    }

In [58]:
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

100%|██████████| 4627/4627 [03:54<00:00, 19.73it/s]


{'hit_rate': 0.7395720769397017, 'mmr': 0.6029788920106625}

In [59]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:19<00:00, 241.78it/s]


{'hit_rate': 0.7722066133563864, 'mmr': 0.661454506159499}