<a href="https://colab.research.google.com/github/giambono/divine_semantics/blob/main/notebooks/run_performance_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Check if the notebook is running on Google Colab
def is_colab():
    """Report whether the notebook is executing inside Google Colab.

    The check is a plain import probe: the function returns True when
    `google.colab` can be imported and False when the import raises
    ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

# Clone the repository if running on Colab
if is_colab():
    print("Running on Google Colab. Cloning repository...")
    !git clone https://github.com/giambono/divine_semantics.git
    os.chdir("/content/divine_semantics")
    !pip install -r requirements.txt
else:
    import sys
    sys.path.append("..")
    print(f"Working directory set to: {os.getcwd()}")
    print("Not running on Google Colab.")

In [5]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient

# Load variables from a .env file into the process environment.
load_dotenv()

# Read the Qdrant connection settings from the environment and build the
# client from them in one step.
qdrant_url, qdrant_api_key = os.getenv("QDRANT_URL"), os.getenv("QDRANT_API_KEY")
qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

In [7]:
import ast
import numpy as np
import pandas as pd
from src.query import run_evaluation
from src.utils import load_model

import config

In [9]:
# Define the sample size and the filter constants
N = 1000
SEED = 42  # fixed seed so the same queries are sampled on every run
author_ids = [1, 2, 3, 4, 5]
type_ids = 1  # NOTE(review): scalar despite the plural name -- confirm what run_evaluation expects

# Load the test queries and draw a reproducible sample of at most N rows.
path = os.path.join(config.ROOT, "data/paraphrased_verses.parquet")
test_queries = pd.read_parquet(path)
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at the available size.
sample_size = min(N, len(test_queries))
test_queries_sample = test_queries.sample(n=sample_size, random_state=SEED)[
    ["transformed_text", "expected_index"]
]

# `qdrant_client` is reused from the earlier connection cell rather than being
# rebuilt here from the same environment variables.
collection_name = "dante_multilingual_e5"
model = load_model("multilingual_e5")

# Run the evaluation over the sampled queries; it returns per-query records
# together with an aggregate score.
out_collect, performance = run_evaluation(
    qdrant_client,
    collection_name,
    model,
    author_ids,
    type_ids,
    test_queries_sample,
)

# Persist the per-query results next to the notebook's working directory.
out_collect_df = pd.DataFrame(out_collect, columns=["query_text", "is_correct"])
out_collect_df.to_csv("output.csv", index=False)

# presumably `performance` is a fraction in [0, 1]; it is shown as a percentage
print(f"True count: {performance * 100:.2f}%")


True count: 78.30%
