In [None]:
from pathlib import Path

In [None]:
# Root folder of the Halifax intermediaries dataset, resolved against the current working directory.
dataset = Path.cwd() / "datasets" / "halifax_intermediaries"

In [None]:
import pandas as pd

In [None]:
# Read the evaluation questions (first CSV column becomes the index) and preview three rows.
test_question = pd.read_csv(dataset / "test_questions.csv", index_col=0)
test_question.head(3)

In [None]:
# Read the chunk table (first CSV column becomes the index) and preview three rows.
chunks_df = pd.read_csv(dataset / "data.csv", index_col=0)
chunks_df.head(3)

In [None]:
# Model identifier for the first embedding run; it is passed to SentenceTransformer in the next cell.
first_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

In [None]:
from sentence_transformers import SentenceTransformer
# Build the first encoder from the identifier stored in first_embedding_model.
model = SentenceTransformer(first_embedding_model)

In [None]:
# Encode every test question with the first model; results are requested as tensors.
question_embeddings = model.encode(
    test_question["TCS Question"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
)

In [None]:
# Encode every chunk's text with the first model; results are requested as tensors.
chunks_embedding = model.encode(
    chunks_df["content"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
)

In [None]:
# Cosine similarity of every question embedding against every chunk embedding,
# moved to the CPU and converted to a NumPy array.
from sentence_transformers.util import cos_sim

cosine_similarities = (
    cos_sim(question_embeddings, chunks_embedding)
    .to("cpu")
    .numpy()
)

In [None]:
import numpy as np

In [None]:
# Disabled inspection loop, kept inside a string literal so that it does not execute.
# It reads top_k_indices and top_k_similarities, which are only assigned further
# down in the notebook, so it cannot run at this position.
"""for question, indices, similarities in zip(test_question["TCS Question"].tolist(), top_k_indices, top_k_similarities):
    print("the question is", question)
    for chunk, similary in zip(chunks_df.content.values[indices], similarities):
        print(f"The chunk is => {chunk} the similarity is {similary}")
    print(10 * " **")"""

Need to continue the exercise tomorrow.

At this point, let me try two other approaches: first a better embedding model, and then second-level chunking. Then evaluate the results of each.

In [None]:
def analyse_embedding(chunks_embedding, question_embeddings, top_k):
    """Rank chunks for each question by cosine similarity and keep the best ``top_k``.

    Args:
        chunks_embedding: Chunk embeddings, passed to ``cos_sim`` as its second argument.
        question_embeddings: Question embeddings, passed to ``cos_sim`` as its first argument.
        top_k: How many of the most similar chunks to keep per question. Must be
            at least 1; a value larger than the number of chunks keeps all of them.

    Returns:
        A tuple ``(top_k_indices, top_k_similarities, mean_cosine_similarity)``:
        ``top_k_indices`` holds, per question, the chunk positions ordered from
        most to least similar; ``top_k_similarities`` holds the similarities at
        those positions; ``mean_cosine_similarity`` is the per-question mean of
        the kept similarities.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    # Guard the slice below: -0 == 0, so top_k == 0 would silently keep every
    # column, and a negative top_k would slice from the wrong end.
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k!r}")

    cosine_similarities = cos_sim(
        question_embeddings, chunks_embedding).to("cpu").numpy()
    # argsort is ascending: take the last top_k columns, then reverse so the
    # best match comes first.
    top_k_indices = np.argsort(cosine_similarities, axis=1)[:, -top_k:][:, ::-1]
    top_k_similarities = np.take_along_axis(
        cosine_similarities, top_k_indices, axis=1)
    mean_cosine_similarity = top_k_similarities.mean(axis=1)
    return top_k_indices, top_k_similarities, mean_cosine_similarity

In [None]:
# analyse_embedding returns (indices, similarities, per-question means). Unpack
# all three so mean_cosine_similarity is the array of means, not the whole tuple.
top_k_indices, top_k_similarities, mean_cosine_similarity = analyse_embedding(
    chunks_embedding=chunks_embedding,
    question_embeddings=question_embeddings,
    top_k=5,
)

In [None]:
# Single summary score for the first model: average over all questions.
# Expects mean_cosine_similarity to be the array returned as the third element
# of analyse_embedding.
mean_cosine_similarity.mean()

In [None]:
# Identifier of the second embedding model; it is joined onto model_path below to locate the model files.
embedding_model_id = "dunzhang/stella_en_400M_v5"


In [None]:
# Folder holding local model files, resolved against the current working directory.
model_path = Path.cwd() / "models"

In [None]:
# Load the second model from its local folder under model_path.
# The config overrides switch off memory-efficient attention and input unpadding.
stella_embedding_model = SentenceTransformer(
    str(model_path.joinpath(embedding_model_id)),
    trust_remote_code=True,
    config_kwargs={
        "use_memory_efficient_attention": False,
        "unpad_inputs": False,
    },
)

In [None]:
# Repeat the retrieval experiment with the second model. Note that this
# rebinds question_embeddings and chunks_embedding from the first run.
questions = test_question["TCS Question"].tolist()
chunks = chunks_df["content"].tolist()

question_embeddings = stella_embedding_model.encode(
    questions,
    convert_to_tensor=True,
    show_progress_bar=True,
)
chunks_embedding = stella_embedding_model.encode(
    chunks,
    convert_to_tensor=True,
    show_progress_bar=True,
)

top_k_indices, top_k_similarities, mean_cosine_similarity = analyse_embedding(
    chunks_embedding=chunks_embedding,
    question_embeddings=question_embeddings,
    top_k=5,
)

In [None]:
# Single summary score for the second model: average of the per-question means
# unpacked from analyse_embedding in the previous cell.
mean_cosine_similarity.mean()

Changing the embedding model gives a 10% improvement in the overall mean cosine similarity of the top-5 chunks.

In [None]:
# Flatten the retrieval output into one row per (question, retrieved chunk) pair.
# Relies on names set by earlier cells:
#   questions          - list of question strings
#   chunks             - list of chunk strings
#   top_k_indices      - numpy array of shape (num_questions, top_k)
#   top_k_similarities - numpy array of shape (num_questions, top_k)

rows = []
for i, question in enumerate(questions):
    # Walk the columns actually present in the row instead of assuming five,
    # so this cell keeps working when top_k changes or fewer chunks exist.
    ranked_hits = zip(top_k_indices[i], top_k_similarities[i])
    for rank, (chunk_idx, similarity) in enumerate(ranked_hits, start=1):
        rows.append({
            "question": question,
            "chunk_rank": rank,
            "chunk": chunks[chunk_idx],
            "cosine_similarity": similarity
        })

result_df = pd.DataFrame(rows)

In [None]:
# Write the results table into the dataset folder, leaving out the row index.
result_df.to_csv(
    dataset / "halifax_intermediaries_results_stella_m3_embedding.csv",
    index=False,
)

In [None]:
# Show the assembled results table as the cell output.
result_df

In [None]:
# Collapse the long table to one entry per question: a list of
# (chunk, cosine_similarity) tuples in the row order of result_df.
# Despite its name, the result is a pandas Series indexed by question.
# The two columns are selected before .apply so the grouping column is not
# passed to the lambda, which avoids the pandas >= 2.2 DeprecationWarning.
result_dict = (
    result_df
    .groupby("question")[["chunk", "cosine_similarity"]]
    .apply(lambda group: list(zip(group["chunk"], group["cosine_similarity"])))
)

In [None]:
# Copy the grouped results to the system clipboard.
# NOTE(review): this presumably needs a clipboard backend on the machine running
# the kernel and may fail on a headless server; confirm.
result_dict.to_clipboard()