## **InformationRetrievalEvaluator** from sentence_transformers
This provides a more comprehensive suite of metrics, but we can only run it against the sentencetransformers compatible models (open source and our finetuned model, not the OpenAI embedding model).

In [57]:
dataset_name = "QA/gpt-35-turbo_dataset.json"

In [58]:
from utils.utils_embedding import load_qa_dataset

dataset = load_qa_dataset(dataset_name)

[32m2025-07-16 17:50:57.838[0m | [1mINFO    [0m | [36mutils.utils_embedding[0m:[36mload_qa_dataset[0m:[36m15[0m - [1mLoading dataset from QA/gpt-35-turbo_dataset.json[0m


In [59]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path

def evaluate_sentence_transformer(dataset, model_id, name):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs
    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name=name)
    model = SentenceTransformer(model_id)
    output_path = "results/"
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [60]:
model_id = "finetuned-sentence-transformers/finetuned-paraphrase-multilingual-MiniLM-L12-v2"
dict_results = evaluate_sentence_transformer(dataset, model_id=model_id, name="finetuned-paraphrase-multilingual-MiniLM-L12-v2")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: finetuned-sentence-transformers/finetuned-paraphrase-multilingual-MiniLM-L12-v2
Load pretrained SentenceTransformer: finetuned-sentence-transformers/finetuned-paraphrase-multilingual-MiniLM-L12-v2
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Information Retrieval Evaluation of the model on the finetuned-paraphrase-multilingual-MiniLM-L12-v2 dataset:
Information Retrieval Evaluation of the model on the finetuned-paraphrase-multilingual-MiniLM-L12-v2 dataset:
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Queries: 260
Queries: 260
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Corpus: 260

Corpus: 260

INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Score-Function: cosine
Score-Function: cosine
INFO:sentence_transforme

In [61]:
model_id = "paraphrase-multilingual-MiniLM-L12-v2"
dict_results = evaluate_sentence_transformer(dataset, model_id=model_id, name="paraphrase-multilingual-MiniLM-L12-v2")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2
Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Information Retrieval Evaluation of the model on the paraphrase-multilingual-MiniLM-L12-v2 dataset:
Information Retrieval Evaluation of the model on the paraphrase-multilingual-MiniLM-L12-v2 dataset:
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Queries: 260
Queries: 260
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Corpus: 260

Corpus: 260

INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Score-Function: cosine
Score-Function: cosine
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@1: 45.38%
Accuracy@1: 45.38%
INFO:sentence_transfor

In [62]:
import pandas as pd

df_finetuned = pd.read_csv("results/Information-Retrieval_evaluation_finetuned-paraphrase-multilingual-MiniLM-L12-v2_results.csv")
df_finetuned

Unnamed: 0,epoch,steps,cosine-Accuracy@1,cosine-Accuracy@3,cosine-Accuracy@5,cosine-Accuracy@10,cosine-Precision@1,cosine-Recall@1,cosine-Precision@3,cosine-Recall@3,cosine-Precision@5,cosine-Recall@5,cosine-Precision@10,cosine-Recall@10,cosine-MRR@10,cosine-NDCG@10,cosine-MAP@100
0,-1,-1,0.611538,0.769231,0.823077,0.85,0.611538,0.611538,0.25641,0.769231,0.164615,0.823077,0.085,0.85,0.697718,0.735158,0.702163


In [63]:
df_no_finetuned = pd.read_csv("results/Information-Retrieval_evaluation_paraphrase-multilingual-MiniLM-L12-v2_results.csv")
df_no_finetuned

Unnamed: 0,epoch,steps,cosine-Accuracy@1,cosine-Accuracy@3,cosine-Accuracy@5,cosine-Accuracy@10,cosine-Precision@1,cosine-Recall@1,cosine-Precision@3,cosine-Recall@3,cosine-Precision@5,cosine-Recall@5,cosine-Precision@10,cosine-Recall@10,cosine-MRR@10,cosine-NDCG@10,cosine-MAP@100
0,-1,-1,0.453846,0.607692,0.653846,0.707692,0.453846,0.453846,0.202564,0.607692,0.130769,0.653846,0.070769,0.707692,0.537966,0.57911,0.544551


In [68]:
df_improvement = (df_finetuned - df_no_finetuned)/ df_no_finetuned * 100
df_improvement.to_csv("results/Information-Retrieval_evaluation_improvement.csv")
df_improvement

Unnamed: 0,epoch,steps,cosine-Accuracy@1,cosine-Accuracy@3,cosine-Accuracy@5,cosine-Accuracy@10,cosine-Precision@1,cosine-Recall@1,cosine-Precision@3,cosine-Recall@3,cosine-Precision@5,cosine-Recall@5,cosine-Precision@10,cosine-Recall@10,cosine-MRR@10,cosine-NDCG@10,cosine-MAP@100
0,-0.0,-0.0,34.745763,26.582278,25.882353,20.108696,34.745763,34.745763,26.582278,26.582278,25.882353,25.882353,20.108696,20.108696,29.695723,26.946271,28.943414
