In [None]:
from pathlib import Path

In [None]:
# All dataset paths below are built relative to the process's current working directory.
current_working_directory = Path.cwd()

In [None]:
# Folder holding the Halifax intermediaries CSV files, relative to the working directory.
data_directory = current_working_directory / "datasets" / "halifax_intermediaries"
# Cell output: True when the folder is present on disk.
data_directory.exists()

### Retrieval

In [None]:
from src.rag.components.embeddings.embeddings import EmbeddingComputer
# Name of the embedding model; its last path component is reused further down
# when naming the output CSV.
embedding_model_name = "intfloat/multilingual-e5-large"
# Project wrapper around the model; the underlying encoder is accessed later via `.model`.
embedding_computer = EmbeddingComputer(model_name=embedding_model_name)

In [None]:
# CSV with the test questions (loaded below as tcs_query_df).
tcs_data_path = data_directory.joinpath("test_questions.csv")

In [None]:
# CSV with the other QA questions (loaded below as other_query_df).
other_data_path = data_directory.joinpath("qa_question.csv")

In [None]:
# Sanity check: displays True when the test-questions file is present.
tcs_data_path.exists()

In [None]:
# Sanity check: displays True when the QA-questions file is present.
other_data_path.exists()

In [None]:
import pandas as pd

In [None]:
# Load the QA questions: the CSV columns at positions 1 and 2 become a two-level
# row index, and the leftover "Unnamed: 0" column is discarded.
# presumably the two index columns are Conversation_Index and Question_Index,
# matching the index built for tcs_query_df — confirm against the CSV header.
other_query_df = (
    pd.read_csv(other_data_path, index_col=[1, 2])
    .drop(columns=["Unnamed: 0"])
)

In [None]:
# Preview the first rows of the QA questions.
other_query_df.head()

In [None]:
# Load the test questions, using the first CSV column as the row index.
tcs_query_df = pd.read_csv(tcs_data_path, index_col=0)
# Rename the single remaining column; this raises if the CSV has more than one data column.
tcs_query_df.columns = ["Question"]

In [None]:
# Give every test question the same two keys that the set_index cell below uses.
# Each test question gets Question_Index 0.
tcs_query_df["Question_Index"] = 0
# Offset the row labels by 100 — presumably to keep these ids clear of the
# conversation ids in other_query_df; confirm that those stay below 100.
tcs_query_df["Conversation_Index"] = 100 + tcs_query_df.index

In [None]:
# Move the two helper columns into a (Conversation_Index, Question_Index) row index.
# NOTE(review): this rebinds tcs_query_df, so the cell is not idempotent — running it
# a second time raises KeyError because the columns are already in the index.
tcs_query_df = tcs_query_df.set_index(["Conversation_Index", "Question_Index"])

In [None]:
# Stack both question sets into one frame and order the rows by their index.
# presumably other_query_df carries the same (Conversation_Index, Question_Index)
# index levels as tcs_query_df — confirm against the CSV.
question_df = pd.concat([other_query_df, tcs_query_df]).sort_index()

In [None]:
# Preview the combined questions.
question_df.head()

In [None]:
# (rows, columns) of the combined question frame.
question_df.shape

In [None]:
# Remove the literal "QUESTION: " marker from every question.
# regex=False: the marker is plain text, so match it literally instead of relying on
# the pandas-version-dependent default for `regex`.
# Bracket assignment is used because attribute assignment only updates a column when
# it already exists (otherwise it silently sets a plain Python attribute).
question_df["Question"] = question_df["Question"].str.replace("QUESTION: ", "", regex=False)

In [None]:
# Keep only the rows whose Question_Index is 0
# (presumably the opening question of each conversation — confirm).
initial_query_df = question_df.query("Question_Index == 0")

In [None]:
from src.rag.components.data_ingestion.utils import create_postgres_connection, create_postgres_connection_uri

In [None]:
from src.rag.components.shared.databases.postgres import PostgresVectorDBClient

In [None]:
# Open a Postgres connection through the project helpers, then wrap it in the
# vector-DB client under the "halifax_intermediaries" namespace.
connection_uri = create_postgres_connection_uri()
connection = create_postgres_connection(connection_uri)
postgres_client = PostgresVectorDBClient(
    connection=connection,
    namespace="halifax_intermediaries",
)


In [None]:
# Prepend "query: " to every question before encoding — presumably the prefix this
# E5 model expects for search queries; confirm against the model card.
prefixed_questions = "query: " + initial_query_df.Question.values

# One embedding per question, in the row order of initial_query_df; the
# normalize_embeddings flag asks the encoder for normalised vectors.
query_embeddings = embedding_computer.model.encode(
    prefixed_questions,
    convert_to_tensor=False,
    show_progress_bar=True,
    normalize_embeddings=True,
)

In [None]:
# Inspect the shape of the encoded queries.
query_embeddings.shape

In [None]:
# Make sure an HNSW index exists on the embedding column before searching.
# unique=False: PostgreSQL only supports UNIQUE on B-tree indexes, so a unique HNSW
# index cannot be created — and two documents may legitimately share an embedding.
postgres_client.create_embedding_index(
    table_name="halifax_intermediaries_documents",
    column_name="embedding",
    index_config="USING hnsw",
    if_not_exists=True,
    unique=False,
)

In [None]:
# Run one vector search per query embedding, returning the "content" and "url" columns.
# NOTE(review): `retrieved_text` is rebound on every iteration, so after the loop it
# holds only the result for the LAST query; the results of all earlier searches are
# discarded. The cells below (`.keys()`, `from_dict(..., orient="index")`, the
# `query_index` column) look like they expect one entry per query — TODO confirm the
# return shape of `search_by_vector` and either accumulate the results or use
# `search_many_by_vector`.
for embedding in query_embeddings.tolist():
    retrieved_text = postgres_client.search_by_vector(
        vector_column="embedding",
        query_vector=embedding,
        table_name="halifax_intermediaries_documents",
        return_columns=["content", "url"],
    )

In [None]:
# Show the keys of the search result (as left by the final loop iteration above).
print(retrieved_text.keys())

In [None]:
def flatten(row):
    """Expand one iterable cell into a pandas Series, one element per position.

    Used with ``Series.apply`` so that each element of ``row`` ends up in its own
    column of the resulting frame.
    """
    return pd.Series(list(row))


In [None]:
# Build a frame with one row per key of retrieved_text, stack its cells into a
# single Series, then expand every cell into its own set of columns via flatten.
question_paragraphs_df = (
    pd.DataFrame.from_dict(retrieved_text, orient="index")
    .stack()
    .apply(flatten)
)

In [None]:
# Discard the stacked index in favour of plain 0..n-1 row labels.
question_paragraphs_df = question_paragraphs_df.reset_index(drop=True)
# Name the expanded columns. This assumes every result row has exactly these four
# fields in this order — TODO confirm against the search client's row layout.
question_paragraphs_df.columns = ["cosine_similarity", "content", "url", "query_index"]

In [None]:
# Shift query_index down by one (presumably from the search client's 1-based query
# numbering to 0-based positions — confirm).
# NOTE(review): not idempotent — every re-run of this cell subtracts one more.
question_paragraphs_df["query_index"] -= 1

In [None]:
# Preview the first retrieved paragraphs.
question_paragraphs_df.head(10)

In [None]:
# Turn the index levels into ordinary columns so they are available to the merge
# below; the row labels become 0..n-1 in the current row order.
# NOTE(review): rebinding the same name means a re-run adds an extra "index" column.
initial_query_df = initial_query_df.reset_index()

In [None]:
# Attach each retrieved paragraph to the question it was retrieved for.
# query_index is treated as the 0-based position of the question in the encoded
# batch (assumption — TODO confirm against the search client). After reset_index()
# the row labels of initial_query_df are exactly those positions, so join on them.
# Joining on Conversation_Index instead is only right when every conversation id
# equals its row position, which the +100 offset on the test questions does not
# guarantee; mismatched rows would be silently dropped by the inner join.
question_paragraph_answers = question_paragraphs_df.merge(
    initial_query_df,
    left_on="query_index",
    right_index=True,
)

In [None]:
# Preview the merged question/paragraph rows.
question_paragraph_answers.head()

In [None]:
# Order rows by query, then question, with the most similar paragraph first, and
# index the result by (Question, query_index).
# NOTE(review): rebinding the same name makes this cell fail on a second run,
# because the index columns are no longer available to set_index.
question_paragraph_answers = (
    question_paragraph_answers
    .sort_values(
        by=["query_index", "Question", "cosine_similarity"],
        ascending=[True, True, False],
    )
    .set_index(["Question", "query_index"])
)

In [None]:
# Preview the sorted, re-indexed result.
question_paragraph_answers.head(20)


In [None]:
# (rows, columns) of the final question/paragraph table.
question_paragraph_answers.shape

In [None]:
# Write the result next to the input data; the file name carries the last path
# component of the embedding model name.
model_short_name = embedding_model_name.split('/')[-1]
output_csv_path = data_directory / f"question_and_paragraph_{model_short_name}.csv"
question_paragraph_answers.to_csv(output_csv_path)

### Next step.

Now perform keyword search.

Extract keywords from the text. Update the keyword search model using Yet Another Keyword Extractor (YAKE).

When we get a question, run a keyword search first and then refine the results with semantic search.

### Things to try after the first baseline

Forget the Keyword search, but think about how to make RAG better.

- https://raghunaathan.medium.com/query-translation-for-rag-retrieval-augmented-generation-applications-46d74bff8f07

- https://learn.microsoft.com/en-us/azure/architecture/ai-ml/guide/rag/rag-information-retrieval

- But we can add keywords in chunks.

- https://www.youtube.com/watch?v=77qELPbNgxA
- Try https://maartengr.github.io/KeyBERT/ for keyword extraction

In [None]:
# Shape of a small corner of the embedding matrix: first five rows, first five columns.
query_embeddings[:5, :5].shape

In [None]:
# Batch search for the first five queries.
# Slice rows only: the original `[:5, :5]` also cut every query vector down to its
# first five components, which no longer matches the width of the embeddings
# produced by the same model for the stored "embedding" column.
# NOTE(review): table_name is "documents" here but "halifax_intermediaries_documents"
# in the cells above — confirm which name the client expects.
postgres_client.search_many_by_vector(
    table_name="documents",
    vector_column="embedding",
    query_vectors=query_embeddings[:5],
    return_columns=["content", "url"],
    candidate_limit=5,
)