### RAG Fusion Trial

In [None]:
from pathlib import Path

In [None]:
# Directory the notebook kernel was started from; the dataset and output paths
# used in this notebook are built relative to it.
current_working_directory = Path.cwd()

In [None]:
# Folder holding the Halifax intermediaries dataset, relative to the working directory.
data_directory = current_working_directory / "datasets/halifax_intermediaries/"
# Last expression of the cell: reports whether that folder is present on disk.
data_directory.exists()

In [None]:
# CSV with the rephrased questions; stop early if it is missing.
path = data_directory / "rephrased_questions.csv"
assert path.exists()

### Retrieval

In [None]:
from src.rag.components.embeddings.embeddings import EmbeddingComputer
# Model identifier; its last path component is also used in the name of the
# CSV file written at the end of the notebook.
embedding_model_name = "intfloat/multilingual-e5-large"
# Project wrapper around the embedding model; the retrieval loop calls
# `embedding_computer.model.encode(...)` on it.
embedding_computer = EmbeddingComputer(model_name=embedding_model_name)

In [None]:
from src.rag.components.data_ingestion.utils import create_postgres_connection, create_postgres_connection_uri

In [None]:
from src.rag.components.shared.databases.postgres import PostgresVectorDBClient

In [None]:
# Build the connection URI, open the Postgres connection, and hand it to the
# vector-DB client together with the "halifax_intermediaries" namespace.
connection_uri = create_postgres_connection_uri()
connection = create_postgres_connection(connection_uri)
postgres_client = PostgresVectorDBClient(
    connection=connection,
    namespace="halifax_intermediaries",
)


In [None]:
import pandas as pd

In [None]:
# Load the rephrased questions. The first two CSV columns form a two-level
# index (level 0 is used below as the original question); the single
# remaining column is renamed to "rephrased_question".
dataset = pd.read_csv(path, index_col=[0, 1])
dataset = dataset.set_axis(["rephrased_question"], axis="columns")

In [None]:
# Preview the first rows of the rephrased-question table.
dataset.head()

In [None]:
# For every original question (level 0 of the index), embed all of its
# rephrasings and look up candidate chunks for each embedding.
# `question_returned_chunks` maps original question -> whatever
# `search_many_by_vector` returns for that batch of query vectors.
QUERY_PREFIX = "query: "  # NOTE(review): presumably the query prefix E5 models expect — confirm
original_questions = dataset.index.get_level_values(0).unique()

question_returned_chunks = {}
for original in original_questions:
    rephrasings = dataset.loc[original]["rephrased_question"]
    prefixed_queries = (QUERY_PREFIX + rephrasings).tolist()

    query_embeddings = embedding_computer.model.encode(
        prefixed_queries,
        convert_to_tensor=False,
        show_progress_bar=True,
        normalize_embeddings=True,
    )

    question_returned_chunks[original] = postgres_client.search_many_by_vector(
        table_name="halifax_intermediaries_documents",
        query_vectors=query_embeddings.tolist(),
        return_columns=["content", "url"],
        candidate_limit=10,
        vector_column="embedding",
    )

In [None]:
# Flatten the nested retrieval results into one row per returned chunk.
# NOTE(review): assumes each value of `question_returned_chunks` is a mapping
# of query index -> ranked list of 4-item records — confirm against
# `search_many_by_vector`.
# Step 1: one row per question, one column per query; stack to a long Series.
chunks_per_query = pd.DataFrame.from_dict(
    question_returned_chunks, orient="index"
).stack()
# Step 2: expand each ranked list into columns, then stack again.
chunks_per_rank = chunks_per_query.apply(pd.Series).stack()
# Step 3: expand each record into its individual fields and name them.
question_returned_chunks_df = chunks_per_rank.apply(pd.Series)
question_returned_chunks_df.columns = ["similarity", "content", "url", "question_index"]

In [None]:
# Sanity check: shape of the index, i.e. how many chunk rows were produced.
question_returned_chunks_df.index.shape


In [None]:
def reciprocal_rank_fusion(results: "dict[object, list] | list[list]", k=60):
    """Fuse several ranked document lists into one ranking (Reciprocal Rank Fusion).

    A document's fused score is the sum, over every ranked list it appears in,
    of ``1 / (rank + k)``, where ``rank`` is the document's zero-based position
    in that list.

    Args:
        results: Either a mapping whose values are ranked lists of documents
            (the keys are ignored), or a plain sequence of ranked lists.
            Each document must be indexable, with the content at position 1
            and the URL at position 2, e.g.
            ``(similarity, content, url, question_index)``.
        k: Constant added to the rank in the denominator; larger values
            flatten the difference between high and low ranks.

    Returns:
        A list of ``((content, url), fused_score)`` tuples sorted by fused
        score in descending order. Documents with equal scores keep the order
        in which they were first seen.

    Raises:
        ZeroDivisionError: If ``rank + k`` is zero for some document
            (for example ``k=0`` with a non-empty list).
    """
    # Anything exposing ``.items()`` is treated as a mapping, as before;
    # otherwise ``results`` is taken to be the sequence of ranked lists that
    # the signature has always advertised.
    if hasattr(results, "items"):
        ranked_lists = (docs for _, docs in results.items())
    else:
        ranked_lists = results

    fused_scores = {}
    for docs in ranked_lists:
        for rank, doc in enumerate(docs):
            # Documents are identified by (content, url) so the same chunk
            # returned for different queries accumulates a single score.
            doc_key = (doc[1], doc[2])
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # ``sorted`` is stable, so ties preserve first-seen order.
    return sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)

In [None]:
# For each original question, fuse the rankings obtained from its rephrasings
# with RRF (k=60) and keep only the 10 highest-scoring documents.
question_fused_results = {
    question: reciprocal_rank_fusion(dict(results), k=60)[:10]
    for question, results in question_returned_chunks.items()
}


In [None]:
# One row per question with one column per fused rank, then reshaped to one
# row per (question, rank) with the (document, score) pair split into columns.
fused_by_question = pd.DataFrame.from_dict(question_fused_results, orient="index")
question_fused_results_df = fused_by_question.stack().apply(pd.Series)

In [None]:
# Preview the fused results before the document tuple is split into columns.
question_fused_results_df.head()

In [None]:
# Column 0 holds the (content, url) document key and column 1 the fused score.
# Expand the key once and reuse it for both derived columns.
document_fields = question_fused_results_df[0].apply(pd.Series)
question_fused_results_df["content"] = document_fields[0]
question_fused_results_df["url"] = document_fields[1]
question_fused_results_df["fused_results"] = question_fused_results_df[1]
# Keep only the named columns, dropping the positional 0/1 columns.
question_fused_results_df = question_fused_results_df[["content", "url", "fused_results"]]

In [None]:
# Preview the final table with the content, url and fused_results columns.
question_fused_results_df.head()

In [None]:
# Persist the fused rankings next to the dataset; the file name carries the
# last path component of the embedding model name.
fused_results_csv_path = current_working_directory.joinpath(
    f"datasets/halifax_intermediaries/question_fused_results_df_{embedding_model_name.split('/')[-1]}.csv"
)
question_fused_results_df.to_csv(fused_results_csv_path)