# Evaluating vector search retrieval

In this section we will again evaluate our search algorithm by measuring its retrieval results against a ground-truth dataset, but this time we will focus on search methods that use embeddings.

In [2]:
import json

from tqdm import tqdm
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

# Load the course FAQ documents from disk.
# JSON text is UTF-8, so the encoding is pinned explicitly instead of relying on
# the platform default (which is not UTF-8 on many Windows setups).
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

  from .autonotebook import tqdm as notebook_tqdm


From the sentence-transformers library we will pick the model with the best search performance, while also taking the model size into account.

In [3]:
# Name of the pretrained sentence-transformers model used for the embeddings below.
model_name = "multi-qa-MiniLM-L6-cos-v1"
# Build the encoder object from that name; `model.encode(...)` is used throughout the notebook.
model = SentenceTransformer(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [8]:
# Encode one sample question so the resulting embedding can be inspected.
v = model.encode("I just discovered the course. Can I still join?")

In [11]:
# Length of the sample embedding; the `dims` value of the index mapping has to match it.
len(v)

384

In [35]:
# Create a client for the Elasticsearch instance at localhost:9200.
es_client = Elasticsearch("http://localhost:9200")
# Display the cluster info as a quick connectivity check.
es_client.info()

ObjectApiResponse({'name': 'eb9088a7b378', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'OAMBaAMSQ36711wobrForQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [36]:
# Index definition: one shard, no replicas, text/keyword fields for the document
# content, and three dense_vector fields that hold the embeddings.
index_settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "text":{"type":"text"},
            "section":{"type":"text"},
            "question":{"type":"text"},
            # `course` is a keyword field; the searches below filter on it with a term query.
            "course":{"type":"keyword"},
            "id":{"type":"keyword"},
            # Embedding of the question alone. `dims` must equal the length of the
            # vectors returned by model.encode (384 for the model used here).
            "question_vector":{
                "type":"dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            # Embedding of the answer text alone.
            "text_vector":{
                "type":"dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            # Embedding of question and answer text concatenated.
            "question_text_vector":{
                "type":"dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

index_name = "course-questions"

# Delete the index first so the cell can be re-run; ignore_unavailable=True keeps
# the delete from failing when the index does not exist yet. Then create it anew.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [37]:
# Fetch the index definition back from Elasticsearch to confirm the mapping was applied.
es_client.indices.get(index=index_name)

ObjectApiResponse({'course-questions': {'aliases': {}, 'mappings': {'properties': {'course': {'type': 'keyword'}, 'id': {'type': 'keyword'}, 'question': {'type': 'text'}, 'question_text_vector': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'cosine'}, 'question_vector': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'cosine'}, 'section': {'type': 'text'}, 'text': {'type': 'text'}, 'text_vector': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'cosine'}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'course-questions', 'creation_date': '1721616507424', 'number_of_replicas': '0', 'uuid': 'albBvYnWQwSEEUl7cii46w', 'version': {'created': '8040399'}}}}})

In [38]:

# Compute three embeddings per document and store them on the document dict
# itself (the entries of `documents` are modified in place).
for doc in tqdm(documents):

    question = doc['question']
    text = doc['text']
    # Question and answer joined by a single space, embedded as one string.
    question_text = question + ' ' + text

    # Keys match the dense_vector field names declared in the index mapping.
    doc['question_vector'] = model.encode(question)
    doc['text_vector'] = model.encode(text)
    doc['question_text_vector'] = model.encode(question_text)

    

100%|██████████| 948/948 [01:42<00:00,  9.21it/s]


In [39]:

# Send every document (including its embedding fields) to the index, one request per document.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:55<00:00, 17.13it/s]


In [40]:
# Sample user question used to try out the vector search.
query = 'I just discovered the course. Can I still join it?'

In [41]:
# Embed the sample question with the same model that embedded the documents.
query_vector = model.encode(query)

In [46]:
# kNN search request: compare the query embedding with the `question_vector`
# field, keep the 5 nearest documents, and restrict matches to one course.
search_query = {
    "knn": {
        "field": "question_vector",
        "query_vector": query_vector,
        "k": 5,
        "num_candidates": 10000,
        # Only documents whose `course` keyword equals this value are eligible.
        "filter": {
            "term": {
                "course": "data-engineering-zoomcamp"
            }
        }
    },
    # Return only these fields, leaving the embedding vectors out of the response.
    "_source": ["text", "section", "question", "course", "id"]
}


In [47]:

# Execute the kNN request against the index.
es_results = es_client.search(
    index=index_name,
    body=search_query
)

In [48]:
# Show the raw response (hits carry `_score` and `_source`).
es_results

ObjectApiResponse({'took': 45, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 'relation': 'eq'}, 'max_score': 0.89216983, 'hits': [{'_index': 'course-questions', '_id': 'msJY2JABFPRWa2sivAsY', '_score': 0.89216983, '_source': {'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp', 'section': 'General course-related questions', 'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'id': '7842b56a'}}, {'_index': 'course-questions', '_id': 'n8JY2JABFPRWa2sivQtE', '_score': 0.8608285, '_source': {'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp', 'section': 'General course-related questions', 'text': 'Yes, we will keep all the materials after

In [51]:
# Keep only the stored document of each hit, dropping the Elasticsearch metadata.
result_docs = [hit['_source'] for hit in es_results['hits']['hits']]


In [52]:
# Display the extracted documents.
result_docs

[{'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'id': '7842b56a'},
 {'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'id': 'a482086d'},
 {'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp',
  'section': '

In [57]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search on one vector field and return the documents.

    Args:
        field: Name of the dense_vector field to compare against
            (e.g. "question_vector", "text_vector", "question_text_vector").
        vector: Query embedding; its length must match the field's `dims`.
        course: Value the `course` keyword field must equal for a document to match.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Size of the candidate pool examined before the top `k`
            are kept. Defaults to 10000, the value that was previously hard-coded.

    Returns:
        list[dict]: The `_source` of every hit, in the order Elasticsearch returned them.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
            # Restrict the neighbours to the requested course.
            "filter": {
                "term": {
                    "course": course
                }
            }
        },
        # Leave the embedding fields out of the response.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]


In [58]:
# Try the helper with the sample query embedding, restricted to one course.
elastic_search_knn("question_vector", query_vector, "data-engineering-zoomcamp")

[{'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'id': '7842b56a'},
 {'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'id': 'a482086d'},
 {'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp',
  'section': '

In [63]:
import pandas as pd

# Load the ground-truth CSV and turn it into a list of row dicts.
# The evaluation code below reads the 'question', 'course' and 'document' keys of each row.
df_ground_truth = pd.read_csv("groud-truth-data.csv")
ground_truth = df_ground_truth.to_dict(orient="records")

In [59]:
import functools
import operator
def hit_rate(relevance_total):
    """Return the fraction of queries whose results contain the expected document.

    Args:
        relevance_total: One list per query, holding a boolean per returned
            result (True where the result is the expected document).

    Returns:
        float: Number of queries with at least one True, divided by the number
        of queries. 0.0 when `relevance_total` is empty.
    """
    if not relevance_total:
        return 0.0

    # A query counts once no matter how many of its results match, so the
    # rate stays within [0, 1] even if the same id is returned more than once.
    hits = sum(1 for row in relevance_total if any(row))
    return hits / len(relevance_total)

In [60]:
def mmr(relevance_total):
    """Return the mean reciprocal rank of the expected document across queries.

    For each query the score is 1 / rank of the first relevant result
    (rank 1 is the top result) and 0 when no result is relevant.

    Args:
        relevance_total: One list per query, holding a boolean per returned
            result in ranked order (True where the result is the expected document).

    Returns:
        float: Average of the per-query scores. 0.0 when `relevance_total` is empty.
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0
    for row in relevance_total:
        for rank, is_relevant in enumerate(row, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant position contributes; later
                # matches in the same row must not inflate the score.
                break

    return total_score / len(relevance_total)

In [61]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each has a 'document' key with the
            id of the expected document and is passed whole to `search_function`.
        search_function: Callable taking one record and returning a list of
            result dicts that each have an 'id' key.

    Returns:
        dict: {"hit_rate": ..., "mmr": ...} computed over all records.
    """
    def relevance_for(record):
        # One boolean per returned result: does it carry the expected id?
        expected_id = record['document']
        retrieved = search_function(record)
        return [item['id'] == expected_id for item in retrieved]

    per_query_relevance = [relevance_for(record) for record in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(per_query_relevance),
        mmr=mmr(per_query_relevance),
    )

In [62]:
def question_vector_knn(q):
    """Search the `question_vector` field with the embedding of a record's question.

    Args:
        q: Record with 'question' (text to embed) and 'course' (filter value) keys.

    Returns:
        Whatever `elastic_search_knn` returns for that field, embedding and course.
    """
    user_question, course_name = q['question'], q['course']
    embedding = model.encode(user_question)
    return elastic_search_knn(field="question_vector", vector=embedding, course=course_name)

In [64]:
# Evaluate retrieval on the question embeddings (one encode and one search per ground-truth record).
evaluate(ground_truth, question_vector_knn)

100%|██████████| 4627/4627 [05:49<00:00, 13.25it/s]


{'hit_rate': 0.7741517181759239, 'mmr': 0.6666810748505158}

In [65]:
def text_vector_knn(q):
    """Search the `text_vector` field with the embedding of a record's question.

    Args:
        q: Record with 'question' (text to embed) and 'course' (filter value) keys.

    Returns:
        Whatever `elastic_search_knn` returns for that field, embedding and course.
    """
    user_question, course_name = q['question'], q['course']
    embedding = model.encode(user_question)
    return elastic_search_knn(field="text_vector", vector=embedding, course=course_name)

In [67]:
# Evaluate retrieval on the answer-text embeddings.
evaluate(ground_truth, text_vector_knn)

100%|██████████| 4627/4627 [05:32<00:00, 13.93it/s]


{'hit_rate': 0.9183055975794251, 'mmr': 0.824306606152295}

In [69]:
def question_text_vector_knn(q):
    """Search the `question_text_vector` field with the embedding of a record's question.

    Args:
        q: Record with 'question' (text to embed) and 'course' (filter value) keys.

    Returns:
        Whatever `elastic_search_knn` returns for that field, embedding and course.
    """
    user_question, course_name = q['question'], q['course']
    embedding = model.encode(user_question)
    return elastic_search_knn(field="question_text_vector", vector=embedding, course=course_name)

In [70]:
# Evaluate retrieval on the combined question+text embeddings.
# NOTE(review): the recorded metrics for this cell are digit-for-digit identical to
# the text_vector_knn run, although a different field is searched. Re-run both
# cells on a fresh kernel to confirm the numbers are not stale.
evaluate(ground_truth, question_text_vector_knn)

100%|██████████| 4627/4627 [05:45<00:00, 13.38it/s]


{'hit_rate': 0.9183055975794251, 'mmr': 0.824306606152295}

In [77]:
def elastic_search_knn_combined(vector, course, size=5):
    """Rank a course's documents by the summed cosine similarity of all three vector fields.

    Every document of the course is scored with a script that adds the cosine
    similarity between `vector` and each of `question_vector`, `text_vector`
    and `question_text_vector`.

    Args:
        vector: Query embedding; its length must match the fields' `dims`.
        course: Value the `course` keyword field must equal for a document to match.
        size: Number of top-scoring documents to return. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        list[dict]: The `_source` of every hit, in the order Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": [
                    {
                        "script_score": {
                            "query": {
                                "term": {
                                    "course": course
                                }
                            },
                            "script": {
                                # script_score must not produce a negative score. Each
                                # cosineSimilarity lies in [-1, 1], so the sum of three
                                # needs an offset of 3 (not 1) to stay non-negative.
                                # A constant offset does not change the ranking.
                                "source": """
                                    cosineSimilarity(params.query_vector, 'question_vector') +
                                    cosineSimilarity(params.query_vector, 'text_vector') +
                                    cosineSimilarity(params.query_vector, 'question_text_vector') +
                                    3
                                """,
                                "params": {
                                    "query_vector": vector
                                }
                            }
                        }
                    }
                ],
                # Same course restriction as the inner query; kept so the request
                # structure matches the original.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        },
        # Leave the embedding fields out of the response.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [79]:
# Try the combined-score search with the sample query embedding.
elastic_search_knn_combined(query_vector, "data-engineering-zoomcamp")

[{'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'id': '7842b56a'},
 {'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'id': 'a482086d'},
 {'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course'

In [80]:
def vector_combined_knn(q):
    """Run the combined-score search with the embedding of a record's question.

    Args:
        q: Record with 'question' (text to embed) and 'course' (filter value) keys.

    Returns:
        Whatever `elastic_search_knn_combined` returns for that embedding and course.
    """
    user_question, course_name = q['question'], q['course']
    embedding = model.encode(user_question)
    return elastic_search_knn_combined(vector=embedding, course=course_name)

# Evaluate the combined-score search over the full ground-truth set.
evaluate(ground_truth, vector_combined_knn)

100%|██████████| 4627/4627 [05:43<00:00, 13.48it/s]


{'hit_rate': 0.9033931272963043, 'mmr': 0.804480945176861}