In [1]:
# Importing necessary libraries
import pandas as pd
from elasticsearch import Elasticsearch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load the QA dataset to index; every CSV row becomes one dict in `records`
data = pd.read_csv('../data/investment_data.csv') # NOTE(review): the original note says this "could be the sample" file — confirm which dataset is intended
records = data.to_dict(orient='records')
# Load the ground truth dataset; every CSV row becomes one dict in `ground_truth`
ground_truth_df = pd.read_csv('ground_truth.csv')
ground_truth = ground_truth_df.to_dict(orient='records')

### Create the Ranking Metrics

In this section we create ranking metrics to evaluate the performance of the different retrieval methods. More specifically, we will examine:
- **Hit Rate (HR) at k**: The fraction of retrieval requests whose top k results contain the relevant document
- **Mean Reciprocal Rank (MRR)**: Also takes the rank of the relevant document into account: the higher the relevant document is ranked in a response, the bigger the score

To learn more about these measures read: [20 Popular Machine Learning Metrics. Part 2: Ranking, & Statistical Metrics](https://medium.com/towards-data-science/20-popular-machine-learning-metrics-part-2-ranking-statistical-metrics-22c3e5a937b60)

In [67]:
# Create the Hit Rate metric
def hit_rate(relevance_total):
    """Return the fraction of queries whose results contain a relevant document.

    Parameters
    ----------
    relevance_total : list of list of bool
        One inner list per query; each entry flags whether the result at that
        rank is the relevant document.

    Returns
    -------
    float
        Share of queries with at least one truthy flag, between 0 and 1.
        Returns 0.0 when there are no queries at all.
    """
    # An empty evaluation set has no hits; guard against dividing by zero
    if len(relevance_total) == 0:
        return 0.0

    # A query counts as a hit if any of its retrieved documents is relevant
    return sum(any(line) for line in relevance_total) / len(relevance_total)

In [68]:
# Create the MRR metric
def mrr(relevance_total):
    """Return the Mean Reciprocal Rank over all queries.

    For each query only the first relevant result counts: it contributes
    1 / rank (rank starting at 1). Queries without a relevant result
    contribute 0.

    Parameters
    ----------
    relevance_total : list of list of bool
        One inner list per query; each entry flags whether the result at that
        rank is the relevant document.

    Returns
    -------
    float
        Mean of the per-query reciprocal ranks, between 0 and 1.
        Returns 0.0 when there are no queries at all.
    """
    # An empty evaluation set scores 0; guard against dividing by zero
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first relevant result; counting later matches
                # as well would let a single query score more than 1
                break

    return total_score / len(relevance_total)

In [70]:
# Generic evaluation loop that works with any search function
def evaluate(ground_truth, search_function):
    """Score a search function against the ground truth.

    Parameters
    ----------
    ground_truth : iterable of dict
        Query records; each one must have an 'id' key holding the id of the
        document that should be retrieved.
    search_function : callable
        Called with one ground-truth record; must return an iterable of
        result dicts that each have an 'id' key.

    Returns
    -------
    dict
        {'hit_rate': ..., 'mrr': ...} as computed by hit_rate and mrr.
    """
    def relevance_flags(query_record):
        # One boolean per retrieved document: is it the expected one?
        expected_id = query_record['id']
        return [doc['id'] == expected_id for doc in search_function(query_record)]

    relevance_total = [
        relevance_flags(query_record) for query_record in tqdm(ground_truth)
    ]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Keyword Search

In [6]:
# Connect to the Elasticsearch instance running locally
es_client = Elasticsearch('http://localhost:9200')

# Name of the index that will hold the QA documents
index_name = "investment-info"

# Schema for keyword search: free-text fields use the "text" type,
# identifier / filter fields use the "keyword" type
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "context": {"type": "text"},
            "ticker": {"type": "keyword"},
            "company": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    },
}

# Start from a clean slate: drop any existing index with the same name
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

# Create the index and print the server's response to confirm it worked
response = es_client.indices.create(index=index_name, body=index_settings)
print(response)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'investment-info'}


In [7]:
# Index every record of the QA dataset as a document in the Elasticsearch index
# (one request per record)
for record in tqdm(records):
    es_client.index(index = index_name, document=record)

100%|██████████| 6990/6990 [00:13<00:00, 510.09it/s]


In [None]:
# Parameters to fine-tune

In [96]:
# Keyword search: retrieve documents from the Elasticsearch index
def keyword_search(query, company, boosting):
    """Run a boosted multi_match query restricted to a single company.

    Parameters
    ----------
    query : str
        The user question to search for.
    company : str
        Value the `company` field must equal (applied as a term filter).
    boosting : dict
        Boost for each text field, with keys 'question', 'answer' and
        'context'.

    Returns
    -------
    list of dict
        The `_source` of every hit, in the order Elasticsearch returned them.
        The query asks for 5 documents.
    """
    # Attach each field's boost using the "field^boost" syntax
    boosted_fields = [
        "{}^{}".format(field_name, boosting[field_name])
        for field_name in ("question", "answer", "context")
    ]

    text_match = {
        "multi_match": {
            "query": query,
            "fields": boosted_fields,
            "type": "best_fields",
        }
    }
    company_filter = {"term": {"company": company}}

    search_query = {
        # Number of documents to retrieve
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": company_filter}},
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored document of each hit
    return [hit['_source'] for hit in response['hits']['hits']]

In [100]:
# Boosting configurations to compare: the unboosted baseline, then each text
# field boosted to 2 and to 3 in turn while the other two stay at 1
boosting_pairs = [
    {'question': question_boost, 'answer': answer_boost, 'context': context_boost}
    for question_boost, answer_boost, context_boost in (
        (1, 1, 1),
        (2, 1, 1),
        (3, 1, 1),
        (1, 2, 1),
        (1, 3, 1),
        (1, 1, 2),
        (1, 1, 3),
    )
]

In [102]:
# Evaluate keyword search for one boosting configuration
def keyword_evaluate(ground_truth, boosting):
    """Score keyword_search on the ground truth for the given boosts.

    Parameters
    ----------
    ground_truth : iterable of dict
        Query records with 'id', 'question' and 'company' keys.
    boosting : dict
        Boost per text field, with keys 'question', 'answer' and 'context'.

    Returns
    -------
    dict
        The three boosts plus 'hit_rate' and 'mrr', both expressed as
        percentages rounded to two decimals.
    """
    relevance_total = []

    for query_record in tqdm(ground_truth):
        expected_id = query_record['id']
        # Search with the question text, restricted to the record's company
        hits = keyword_search(
            query_record['question'], query_record['company'], boosting
        )
        # Flag, rank by rank, whether the expected document was retrieved
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    return {
        'question_boost': boosting['question'],
        'answer_boost': boosting['answer'],
        'context_boost': boosting['context'],
        'hit_rate': round(hit_rate(relevance_total) * 100, 2),
        'mrr': round(mrr(relevance_total) * 100, 2),
    }

In [103]:
keyword_results = []
# Evaluate every boosting configuration in boosting_pairs so the best one can be chosen;
# each call returns one dict of boosts and metrics
for boosting_pair in boosting_pairs:
    keyword_results.append(keyword_evaluate(ground_truth, boosting_pair))

100%|██████████| 5175/5175 [00:56<00:00, 92.41it/s] 
100%|██████████| 5175/5175 [00:52<00:00, 97.75it/s] 
100%|██████████| 5175/5175 [00:57<00:00, 89.32it/s] 
100%|██████████| 5175/5175 [00:54<00:00, 95.16it/s] 
100%|██████████| 5175/5175 [00:52<00:00, 99.22it/s] 
100%|██████████| 5175/5175 [00:54<00:00, 94.23it/s] 
100%|██████████| 5175/5175 [00:53<00:00, 96.83it/s] 


In [104]:
# Store the results in a DataFrame (one row per boosting configuration)
keyword_findings = pd.DataFrame(keyword_results)
# Display the results
keyword_findings

Unnamed: 0,question_boost,answer_boost,context_boost,hit_rate,mrr
0,1,1,1,88.58,78.01
1,2,1,1,85.93,74.03
2,3,1,1,84.21,72.01
3,1,2,1,81.14,66.77
4,1,3,1,72.7,58.75
5,1,1,2,85.04,70.7
6,1,1,3,83.46,69.4


In [105]:
# Save the keyword search findings to a CSV file, without the row index
keyword_findings.to_csv('keyword_findings.csv', index=False)

### Vector Search

In [8]:
# Initialize the selected model to create the embeddings
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Create an initial vector / embedding of the first ground-truth question using the model
res = model.encode(ground_truth[0]['question'])
# Find the dimensionality of this vector
len(res)



384

In [10]:
# Create the embeddings for each record in our QA dataset
# (the vectors are stored on the record dicts themselves, under new keys)
for record in tqdm(records):
    # Extract the text fields you want to embed along with their combinations
    question = record['question']
    answer = record['answer']
    context = record['context']
    # Combined texts are the individual fields joined with a single space
    question_answer = question + ' ' + answer
    answer_context = answer + ' ' + context
    question_context = question + ' ' + context
    question_answer_context = question + ' ' + answer + ' ' + context

    # Create the embedding for each text field
    # (one model.encode call per text, i.e. seven calls per record)
    record['question_vector'] = model.encode(question)
    record['answer_vector'] = model.encode(answer)
    record['context_vector'] = model.encode(context)
    record['question_answer_vector'] = model.encode(question_answer)
    record['answer_context_vector'] = model.encode(answer_context)
    record['question_context_vector'] = model.encode(question_context)
    record['question_answer_context_vector'] = model.encode(question_answer_context)

100%|██████████| 6990/6990 [23:58<00:00,  4.86it/s]


In [54]:
# Schema of the Elasticsearch index for vector search: the same text / keyword
# fields as the keyword index, plus one dense_vector field per embedding
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "context": {"type": "text"},
            "ticker": {"type": "keyword"},
            "company": {"type": "keyword"},
            "id": {"type": "keyword"},
            # All embedding fields share one configuration; dims is the
            # dimensionality of the embeddings we want to store
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "answer_vector",
                    "context_vector",
                    "question_answer_vector",
                    "answer_context_vector",
                    "question_context_vector",
                    "question_answer_context_vector",
                )
            },
        }
    },
}

# Name of the index to (re)create
index_name = "investment-info"

# Drop any existing index with this name before creating the new one
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

# Create the index and print the server's response to confirm it worked
response = es_client.indices.create(index=index_name, body=index_settings)
print(response)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'investment-info'}


In [55]:
# Index every record (now including its embedding vectors) as a document in
# the Elasticsearch index, one request per record
for record in tqdm(records):
    es_client.index(index = index_name, document=record)

100%|██████████| 6990/6990 [00:51<00:00, 136.52it/s]


In [76]:
# Vector (kNN) search against the Elasticsearch index

def vector_search(field, vector, company):
    """Run a kNN query on one embedding field, restricted to a single company.

    Parameters
    ----------
    field : str
        Name of the dense_vector field to search.
    vector : sequence of float
        Embedding of the user query.
    company : str
        Value the `company` field must equal (applied as a term filter).

    Returns
    -------
    list of dict
        The `_source` of every hit, in the order Elasticsearch returned them,
        limited to the question, answer, context, ticker, company and id
        fields. The query asks for k=5 neighbours.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": {"term": {"company": company}},
        },
        # Return only the stored text / metadata fields, not the vectors
        "_source": ['question', 'answer', 'context', 'ticker', 'company', 'id'],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [14]:
# Create the embeddings for the ground truth to use for validation
# (the vector is stored on each ground-truth record under 'question_vector')
for record in tqdm(ground_truth):
    # Extract the question for each record
    question = record['question']
    # Create the embedding of each user query and store it in the ground truth records
    record['question_vector'] = model.encode(question)

100%|██████████| 5175/5175 [01:27<00:00, 58.83it/s]


In [90]:
# Evaluate vector search for one embedding field
def vector_evaluate(ground_truth, field):
    """Score vector_search on the ground truth for the given vector field.

    Parameters
    ----------
    ground_truth : iterable of dict
        Query records with 'id', 'question_vector' and 'company' keys.
    field : str
        Name of the dense_vector field to search.

    Returns
    -------
    dict
        The field name plus 'hit_rate' and 'mrr', both expressed as
        percentages rounded to two decimals.
    """
    relevance_total = []

    for query_record in tqdm(ground_truth):
        expected_id = query_record['id']
        # Search with the precomputed question embedding, restricted to the
        # record's company
        hits = vector_search(
            field, query_record['question_vector'], query_record['company']
        )
        # Flag, rank by rank, whether the expected document was retrieved
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    return {
        'field': field,
        'hit_rate': round(hit_rate(relevance_total) * 100, 2),
        'mrr': round(mrr(relevance_total) * 100, 2),
    }

In [91]:
# Embedding fields to compare: each single text field, each pair of fields,
# and all three fields combined
field_list = [
    'question_vector',
    'answer_vector',
    'context_vector',
    'question_answer_vector',
    'answer_context_vector',
    'question_context_vector',
    'question_answer_context_vector',
]

In [92]:
vector_results = []
# Evaluate every embedding field in field_list so the best one can be chosen;
# each call returns one dict with the field name and metrics
for field in field_list:
    vector_results.append(vector_evaluate(ground_truth, field))

100%|██████████| 5175/5175 [00:46<00:00, 110.38it/s]
100%|██████████| 5175/5175 [00:47<00:00, 109.92it/s]
100%|██████████| 5175/5175 [00:47<00:00, 110.01it/s]
100%|██████████| 5175/5175 [00:55<00:00, 93.01it/s] 
100%|██████████| 5175/5175 [01:05<00:00, 78.68it/s] 
100%|██████████| 5175/5175 [00:58<00:00, 88.40it/s] 
100%|██████████| 5175/5175 [01:01<00:00, 84.41it/s] 


In [94]:
# Store the results in a DataFrame (one row per embedding field)
vector_findings = pd.DataFrame(vector_results)
# Display the results
vector_findings

Unnamed: 0,field,hit_rate,mrr
0,question_vector,86.42,74.4
1,answer_vector,63.92,54.51
2,context_vector,82.34,67.96
3,question_answer_vector,86.86,75.48
4,answer_context_vector,84.62,72.39
5,question_context_vector,87.65,76.79
6,question_answer_context_vector,88.04,76.64


In [95]:
# Save the vector search findings to a CSV file, without the row index
vector_findings.to_csv('vector_findings.csv', index=False)

### Hybrid Search

In [108]:
# Hybrid search: combine the best vector field with unboosted keyword search
def hybrid_search(a, query, vector, company):
    """Run a combined kNN + keyword query restricted to a single company.

    Parameters
    ----------
    a : float
        Boost given to the vector search; the keyword search gets 1 - a.
    query : str
        The user question, used by the keyword part.
    vector : sequence of float
        Embedding of the user question, used by the kNN part.
    company : str
        Value the `company` field must equal (term filter on both parts).

    Returns
    -------
    list of dict
        The `_source` of every hit, in the order Elasticsearch returned them,
        limited to the question, answer, context, ticker, company and id
        fields. The query asks for 5 documents.
    """
    company_filter = {"term": {"company": company}}

    search_query = {
        # Vector part, on the combined question + answer + context embedding
        "knn": {
            "field": 'question_answer_context_vector',
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": a,  # weight of the vector search in the final score
            "filter": company_filter,
        },
        # Keyword part, with no per-field boosting
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question", "answer", "context"],
                        "type": "best_fields",
                        "boost": 1 - a,  # weight of the keyword search
                    }
                },
                "filter": company_filter,
            }
        },
        # Number of documents to return
        "size": 5,
        # Fields returned for each retrieved document
        "_source": ['question', 'answer', 'context', 'ticker', 'company', 'id'],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [109]:
# Evaluate hybrid search for one vector / keyword weighting
def hybrid_evaluate(ground_truth, a):
    """Score hybrid_search on the ground truth for the given weighting.

    Parameters
    ----------
    ground_truth : iterable of dict
        Query records with 'id', 'question', 'question_vector' and 'company'
        keys.
    a : float
        Boost given to the vector search; the keyword search gets 1 - a.

    Returns
    -------
    dict
        Both boosts plus 'hit_rate' and 'mrr', the latter two expressed as
        percentages rounded to two decimals.
    """
    relevance_total = []

    for query_record in tqdm(ground_truth):
        expected_id = query_record['id']
        # Search with both the question text and its embedding, restricted to
        # the record's company
        hits = hybrid_search(
            a,
            query_record['question'],
            query_record['question_vector'],
            query_record['company'],
        )
        # Flag, rank by rank, whether the expected document was retrieved
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    return {
        'vector_boosting': a,
        'keyword_boosting': 1 - a,
        'hit_rate': round(hit_rate(relevance_total) * 100, 2),
        'mrr': round(mrr(relevance_total) * 100, 2),
    }

In [111]:
# Candidate weights for the vector search; hybrid_evaluate gives the keyword
# search the complementary weight 1 - a
a_values = [1.0, 0.75, 0.5, 0.25, 0.0]

In [112]:
hybrid_results = []
# Evaluate every weighting in a_values so the best one can be chosen;
# each call returns one dict of boosts and metrics
for a in a_values:
    hybrid_results.append(hybrid_evaluate(ground_truth, a))

100%|██████████| 5175/5175 [01:03<00:00, 81.56it/s] 
100%|██████████| 5175/5175 [01:09<00:00, 74.19it/s] 
100%|██████████| 5175/5175 [01:14<00:00, 69.64it/s]
100%|██████████| 5175/5175 [01:11<00:00, 72.13it/s]
100%|██████████| 5175/5175 [01:04<00:00, 79.90it/s] 


In [113]:
# Store the results in a DataFrame (one row per vector / keyword weighting)
hybrid_findings = pd.DataFrame(hybrid_results)
# Display the results
hybrid_findings

Unnamed: 0,vector_boosting,keyword_boosting,hit_rate,mrr
0,1.0,0.0,88.04,76.64
1,0.75,0.25,90.59,79.97
2,0.5,0.5,89.49,78.86
3,0.25,0.75,88.77,78.41
4,0.0,1.0,88.58,78.01


In [114]:
# Save the hybrid search findings to a CSV file, without the row index
hybrid_findings.to_csv('hybrid_findings.csv', index=False)

## Findings for the best retrieval method

- Hybrid method with 0.75 weight to the vector search and 0.25 weight to the keyword search
    - Keyword search: No boosting was applied to any text field (all boosts equal to 1) - best results
    - Vector search: An embedding of a combination of question, answer and context was used - best results

### Document Reranking