In [1]:
# Importing necessary libraries
import pandas as pd
from elasticsearch import Elasticsearch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load the QA dataset to index
data = pd.read_csv('../data/investment_data.csv') # Could be the sample 
records = data.to_dict(orient='records')
# Load the ground truth dataset
ground_truth_df = pd.read_csv('ground_truth.csv')
ground_truth = ground_truth_df.to_dict(orient='records')

### Create the Ranking Metrics

### Keyword Search

In [6]:
# Initialize the client 
es_client = Elasticsearch('http://localhost:9200')

# Create the Schema of the Elastic Search Index for Keyword search
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "context": {"type": "text"},
            "ticker": {"type": "keyword"}, 
            "company": {"type": "keyword"},
            "id": {"type": "keyword"}
        }
    }
}

# Provide the name of the index
index_name = "investment-info"
# Check if the index exists
if es_client.indices.exists(index=index_name):
    # Delete the existing index
    es_client.indices.delete(index=index_name)
# Create the elastic search index
response = es_client.indices.create(index=index_name, body=index_settings)
# Verify that elastic search is created
print(response)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'investment-info'}


In [7]:
# Fetch all the documents into the elastic search index
for record in tqdm(records):
    es_client.index(index = index_name, document=record)

100%|██████████| 6990/6990 [00:13<00:00, 510.09it/s]


In [None]:
# Parameteres to fine-tune

In [None]:
# Create a keyword search function to retrieve document form the elastic search
def keyword_search(query, company):
    # Create the query
    search_query = {
        # Specifying the number of documents to be retrieved
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        # Add the user query
                        "query": query,
                        # Include the text fields to search
                        "fields": ["question^2", "answer", "context"], # Give a boosting of 2 in the question field
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "company": company
                    }
                }
        }
    }
    }
    # Query the Elastic Search 
    response = es_client.search(index=index_name, body=search_query)

    # Parse the response of elastic search
    results = []
    for hit in response['hits']['hits']:
        results.append(hit['_source'])
    
    return results

### Vector Search

In [8]:
# Initialize the selected model to create the embeddings
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Create an initial vector / embedding of the answer using the model
res = model.encode(ground_truth[0]['question'])
# Find the dimensionality of this vector
len(res)



384

In [9]:
data.head()

Unnamed: 0,question,answer,context,ticker,filing,company,id
0,What area did NVIDIA initially focus on before...,NVIDIA initially focused on PC graphics.,"Since our original focus on PC graphics, we ha...",NVDA,2023_10K,Nvidia Corporation,4f2ccc3b
1,What are some of the recent applications of GP...,Recent applications of GPU-powered deep learni...,Some of the most recent applications of GPU-po...,NVDA,2023_10K,Nvidia Corporation,ee4ed04f
2,What significant invention did NVIDIA create i...,NVIDIA invented the GPU in 1999.,Our invention of the GPU in 1999 defined moder...,NVDA,2023_10K,Nvidia Corporation,7eac6b57
3,How does NVIDIA's platform strategy contribute...,NVIDIA's platform strategy brings together har...,"NVIDIA has a platform strategy, bringing toget...",NVDA,2023_10K,Nvidia Corporation,eb49bbd0
4,What does NVIDIA's CUDA programming model enable?,NVIDIA's CUDA programming model opened the par...,With our introduction of the CUDA programming ...,NVDA,2023_10K,Nvidia Corporation,3e4c199c


In [10]:
# Create the embeddings for each record in our QA dataset
for record in tqdm(records):
    # Extract the text fields you want to embed along with threir combinations
    question = record['question']
    answer = record['answer']
    context = record['context']
    question_answer = question + ' ' + answer
    answer_context = answer + ' ' + context
    question_context = question + ' ' + context
    question_answer_context = question + ' ' + answer + ' ' + context
    
    # Create the embedding for each text field
    record['question_vector'] = model.encode(question)
    record['answer_vector'] = model.encode(answer)
    record['context_vector'] = model.encode(context)
    record['question_answer_vector'] = model.encode(question_answer)
    record['answer_context_vector'] = model.encode(answer_context)
    record['question_context_vector'] = model.encode(question_context)
    record['question_answer_context_vector'] = model.encode(question_answer_context)

100%|██████████| 6990/6990 [23:58<00:00,  4.86it/s]


In [12]:
# Create the Schema of the Elastic Search Index for vector search
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,            # Here we are using the dimensionality of the embedding we want to store 
                "index": True,
                "similarity": "cosine"
            },
            "answer_vector": {
                "type": "dense_vector",
                "dims": 384,             
                "index": True,
                "similarity": "cosine"
            },
            "context_vector": {
                "type": "dense_vector",
                "dims": 384,     
                "index": True,
                "similarity": "cosine"
            },
            "question_answer_vector": {
                "type": "dense_vector",
                "dims": 384,             
                "index": True,
                "similarity": "cosine"
            },
            "answer_context_vector": {
                "type": "dense_vector",
                "dims": 384,             
                "index": True,
                "similarity": "cosine"
            },
            "question_context_vector": {
                "type": "dense_vector",
                "dims": 384,             
                "index": True,
                "similarity": "cosine"
            },
            "question_answer_context_vector": {
                "type": "dense_vector",
                "dims": 384,             
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

# Provide the name of the index
index_name = "investment-info"
# Check if the index exists
if es_client.indices.exists(index=index_name):
    # Delete the existing index
    es_client.indices.delete(index=index_name)
# Create the elastic search index
response = es_client.indices.create(index=index_name, body=index_settings)
# Verify that elastic search is created
print(response)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'investment-info'}


In [None]:
# Fetch all the documents into the elastic search index
for record in tqdm(records):
    es_client.index(index = index_name, document=record)

In [None]:
# Create the new elastic seach query for the vector search

def vector_search(field, vector, company):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "company": company
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ['question', 'answer', 'context', 'ticker' ,'company', ,'id']
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [None]:
# Create the embeddings for the ground truth to use for validations
for record in tqdm(ground_truth):
    # Extract the question for each record
    question = record['question']
    # Create the embedding of each user query and store it in the ground truth records
    record['question_vector'] = model.encode(question)

### Hybrid Search

### Document Reranking