In [1]:
import os

import numpy as np
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

from helper_utils import load_chroma, word_wrap, project_embeddings

In [None]:
# Embedding function built with library defaults (no model name is passed).
embedding_function = SentenceTransformerEmbeddingFunction()

# load_chroma is a project helper; presumably it reads the PDF and returns a
# populated Chroma collection -- confirm in helper_utils.
chroma_collection = load_chroma(
    filename=r'./data/microsoft_annual_report_2022.pdf',
    collection_name='microsoft_annual_report_2022',
    embedding_function=embedding_function,
)

# Bare last expression so the notebook displays the collection's item count.
chroma_collection.count()

In [None]:
# Question sent to the collection; it is also the query side of every
# [query, document] pair built for the cross-encoder further down.
query = "What has been the investment in research and development?"

In [None]:
# Retrieve a wider candidate pool (10 hits) so the cross-encoder has more
# documents to re-rank.
results = chroma_collection.query(
    query_texts=query,
    n_results=10,
    include=['documents', 'embeddings'],
)

In [None]:
# Only one query was issued, so its hits are the first entry of results['documents'].
retrieved_documents = results['documents'][0]

# Print each retrieved chunk, wrapped for readability, followed by a blank line.
for document in retrieved_documents:
    print(word_wrap(document))
    print('')

In [1]:
from sentence_transformers import CrossEncoder

In [None]:
# Cross-encoder used to score [query, document] pairs for re-ranking.
#
# The model cache location was a hard-coded, machine-specific Windows path.
# It can now be overridden with the CROSS_ENCODER_CACHE_DIR environment
# variable; when the variable is unset the original path is used, so existing
# setups behave exactly as before.
cross_encoder = CrossEncoder(
    model_name='cross-encoder/ms-marco-MiniLM-L-6-v2',
    cache_dir=os.environ.get('CROSS_ENCODER_CACHE_DIR',
                             r'D:\AI-DATASETS\07-Hugging-Face-Data'),
)

In [None]:
# One [query, document] pair per retrieved chunk; this list is what gets
# passed to cross_encoder.predict.
pairs = [[query, candidate] for candidate in retrieved_documents]

In [None]:
# Score every [query, document] pair with the cross-encoder.
# Assumes one score per pair, in the same order as `pairs` -- confirm against
# the CrossEncoder.predict documentation.
scores = cross_encoder.predict(pairs)

In [None]:
# Dump the raw cross-encoder scores, one per line, in the order of `scores`.
print("Scores:")
for score in scores:
    print(score)

In [None]:
# np.argsort sorts ascending; [::-1] reverses it so the highest score comes first.
# `o` is a 0-based position in `scores`; o+1 prints it as a 1-based position.
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o+1)

#### Re-ranking with Query Expansion

In [2]:
# Question for the query-expansion run. It is sent to retrieval together with
# the expanded queries, and it alone is paired with each document for re-ranking.
original_query = "What were the most important factors that contributed to increases in revenue?"

In [3]:
# Five related questions, hard-coded here, used to widen retrieval beyond the
# original question.
# NOTE(review): the name suggests these were produced by a model in an earlier
# step -- confirm provenance.
generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?"
]

In [4]:
# Original question first, followed by the expanded variants.
queries = [original_query, *generated_queries]

In [None]:
# Run retrieval for every query at once, asking for 10 hits per query.
results = chroma_collection.query(
    query_texts=queries,
    n_results=10,
    include=['documents', 'embeddings'],
)

# Kept nested here: results['documents'] is iterated as a list of document
# lists by the deduplication step.
retrieved_documents = results['documents']

In [None]:
# Deduplicate the retrieved documents while keeping first-seen order.
#
# A set() would also remove duplicates, but string hashing is randomized per
# interpreter process, so list(set(...)) can come out in a different order after
# every kernel restart. The re-ranking cells print positions into
# `unique_documents`, so that order has to be stable for the printed indices to
# be reproducible. dict.fromkeys keeps the first occurrence of each document
# in insertion order.
unique_documents = list(dict.fromkeys(
    document
    for documents in retrieved_documents
    for document in documents
))

In [None]:
# Each unique document is paired with the original question only; the expanded
# queries are not used for scoring.
pairs = [[original_query, candidate] for candidate in unique_documents]

In [None]:
# Score every [original_query, document] pair with the cross-encoder.
# Assumes one score per pair, in the same order as `pairs` -- confirm against
# the CrossEncoder.predict documentation.
scores = cross_encoder.predict(pairs)

In [None]:
# Dump the raw cross-encoder scores, one per line, in the order of `scores`.
print("Scores:")
for score in scores:
    print(score)

In [None]:
# Positions into `scores`, best score first (argsort ascending, then reversed).
# NOTE(review): printed 0-based here, whereas the earlier "New Ordering" cell
# prints o+1 (1-based) -- keep that in mind when comparing the two outputs.
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o)

| Re-ranking only (1-based, 10 results) | Re-ranking with query expansion (0-based, 22 unique documents) |
|----------|----------|
|    2     |    20    |
|    1     |    17    |
|    3     |    2     |
|    7     |    12    |
|    6     |    8     |
|    9     |    0     |
|   10     |    6     |
|    5     |    15    |
|    4     |    3     |
|    8     |    10    |
|          |    11    |
|          |    16    |
|          |    18    |
|          |    21    |
|          |    14    |
|          |    13    |
|          |    7     |
|          |    5     |
|          |    4     |
|          |    19    |
|          |    9     |
|          |    1     |
