In [None]:
from pathlib import Path
import pandas as pd

## Ingestion

In [None]:
# Working directory of the running kernel; the dataset path is resolved relative to it.
current_working_directory = Path.cwd()

In [None]:
# Location of the CSV dataset, relative to the working directory.
data_path = current_working_directory / "datasets/halifax_intermediaries/data.csv"

In [None]:
# Load the CSV, using its first column as the DataFrame index.
data = pd.read_csv(
    data_path,
    index_col=0,
)

In [None]:
# Turn the index into a regular column named "doc_id"
# (reset_index keeps the old index as a column by default).
# NOTE: re-running this cell on the already-reset frame would try to insert "doc_id" again.
data = data.reset_index(names="doc_id")

In [None]:
# Remove the "length" column; the table schema built in this notebook has no such field.
data = data.drop(columns=["length"])

In [None]:
from typing import  Dict

In [None]:
def to_sql_schema(embedding_dimension: int, table_prefix: str) -> Dict[str, str]:
    """Build the column-name -> SQL-type mapping for the documents table.

    Args:
        embedding_dimension: Size interpolated into the ``vector(...)`` type of
            the ``embedding`` column.
        table_prefix: Accepted for the caller's convenience but not used by
            this function.

    Returns:
        A dict whose keys are ``doc_id``, ``content``, ``url`` and
        ``embedding`` (in that order), each mapped to its SQL type string.
    """
    column_types: Dict[str, str] = {}
    column_types["doc_id"] = "TEXT PRIMARY KEY"
    column_types["content"] = "TEXT"
    column_types["url"] = "text"
    column_types["embedding"] = f"vector({embedding_dimension})"
    # Add other fields as needed
    return column_types

In [None]:
from src.rag.components.data_ingestion.utils import create_postgres_connection, create_postgres_connection_uri

In [None]:
from src.rag.components.shared.databases.postgres import PostgresVectorDBClient

In [None]:
# Build the connection URI, open the connection, and wrap it in the vector DB
# client scoped to the "halifax_intermediaries" namespace.
connection_uri = create_postgres_connection_uri()
connection = create_postgres_connection(connection_uri)
postgres_client = PostgresVectorDBClient(
    connection=connection,
    namespace="halifax_intermediaries",
)


In [None]:
# Size used for the `vector(...)` type of the "embedding" column.
# NOTE(review): must equal the length of the vectors produced by the embedding
# model configured in this notebook — confirm 1024 is right for that model.
EMBEDDING_DIMENSION = 1024

In [None]:
# Column definitions for the documents table.
schemas = to_sql_schema(
    embedding_dimension=EMBEDDING_DIMENSION,
    table_prefix="halifax_intermediaries",
)

In [None]:
# Create the documents table from the schema mapping.
# if_not_exists=True presumably makes this a no-op when the table is already there — confirm.
postgres_client.create_table(
    name="halifax_intermediaries_documents",
    schema=schemas,
    if_not_exists=True,
)

In [None]:
# Add a text-search field for the "content" column of the documents table
# (the exact effect is defined by PostgresVectorDBClient.add_text_search_field).
postgres_client.add_text_search_field(
    table_name="halifax_intermediaries_documents",
    column_name="content",
)

In [None]:
# Identifier of the embedding model handed to EmbeddingComputer.
embedding_model_name = "intfloat/multilingual-e5-large"

In [None]:
from src.rag.components.embeddings.embeddings import EmbeddingComputer

In [None]:
# Instantiate the project's embedding wrapper. Its `.model` attribute is the
# object whose `.encode(...)` is called by compute_embeddings_in_batch.
embedding_computer = EmbeddingComputer(model_name=embedding_model_name)

In [None]:
from typing import List, Dict

In [None]:
def assign_embeddings(
    nodes: List[Dict], embeddings: List[List[float]]
) -> List[Dict]:
    """Attach one embedding to each node under the ``"embedding"`` key.

    Nodes are updated in place and the same list object is returned.

    Args:
        nodes: Records to annotate; each must support item assignment.
        embeddings: One vector per node, in the same order. A vector may be an
            array-like exposing ``tolist()`` or an ordinary sequence of floats.

    Returns:
        ``nodes``, with every element carrying its vector as a plain list.

    Raises:
        ValueError: If ``nodes`` and ``embeddings`` differ in length. Pairing
            them anyway would silently leave some nodes without a vector.
    """
    if len(nodes) != len(embeddings):
        raise ValueError(
            f"Got {len(nodes)} nodes but {len(embeddings)} embeddings; "
            "expected exactly one embedding per node"
        )
    for node, embedding in zip(nodes, embeddings):
        # Array-likes are converted through their own tolist(); plain
        # sequences (as the signature advertises) are copied into a list.
        to_list = getattr(embedding, "tolist", None)
        node["embedding"] = to_list() if callable(to_list) else list(embedding)
    return nodes


In [None]:
def compute_embeddings_in_batch(all_nodes: List[Dict], batch_size: int) -> List[Dict]:
    """Embed the ``content`` of every node, batch by batch, and attach the vectors.

    The texts are encoded with the notebook-level ``embedding_computer`` and
    the resulting vectors are attached through ``assign_embeddings``.

    Args:
        all_nodes: Records to embed; each record's ``content`` value is read
            with ``.get``.
        batch_size: Number of records sent to the encoder per call. Must be
            at least 1.

    Returns:
        The list produced by ``assign_embeddings(all_nodes, ...)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Zero would otherwise
            fail inside ``range`` with an unrelated message, and a negative
            value would skip encoding entirely.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    all_embeddings = []
    print(f"I am dealing with {len(all_nodes)} nodes")
    for start in range(0, len(all_nodes), batch_size):
        batch_nodes = all_nodes[start: start + batch_size]
        # Every text is prefixed with "passage: " before encoding
        # (E5-family models are documented to expect this prefix for documents).
        # NOTE: a record without "content" is encoded as the text "passage: None".
        texts = [f"passage: {node.get('content')}" for node in batch_nodes]
        batch_embedding = embedding_computer.model.encode(
            texts,
            convert_to_tensor=False,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        all_embeddings.extend(batch_embedding)
    nodes_with_embedding = assign_embeddings(all_nodes, all_embeddings)
    print(f"Computed embeddings for {len(nodes_with_embedding)} nodes")
    return nodes_with_embedding

In [None]:
# Convert every DataFrame row to a dict and embed the records in batches of 8.
chunk_nodes = compute_embeddings_in_batch(
    all_nodes=data.to_dict(orient="records"),
    batch_size=8,
)

In [None]:
# Write all embedded records into the documents table.
# returning=["doc_id"] presumably asks for the inserted ids back — confirm against bulk_insert.
inserted_chunk_nodes = postgres_client.bulk_insert(
    table_name="halifax_intermediaries_documents",
    data=chunk_nodes,
    returning=["doc_id"],
)

Done with the ingestion. Next, create the embedding and full-text indexes on the documents table.

In [None]:
# Index the "embedding" column; the index method is passed through as "USING vchordrq".
postgres_client.create_embedding_index(
    table_name="halifax_intermediaries_documents",
    column_name="embedding",
    index_config="USING vchordrq",
)

In [None]:
# Index the "content" column for full-text search.
postgres_client.create_full_text_index(
    table_name="halifax_intermediaries_documents",
    column_name="content",
)