In [1]:
import os

# Check if the notebook is running on Google Colab
def is_colab():
    """
    Report whether the `google.colab` package can be imported.

    Returns:
        bool: True when `import google.colab` succeeds, False when it raises
        ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

# Clone the repository if running on Colab
if is_colab():
    print("Running on Google Colab. Cloning repository...")
    !git clone https://github.com/giambono/divine_semantics.git
    os.chdir("/content/divine_semantics")
    !pip install -r requirements.txt
else:
    import sys
    sys.path.append("..")
    print(f"Working directory set to: {os.getcwd()}")
    print("Not running on Google Colab.")

Working directory set to: /home/rfflpllcn/IdeaProjects/divine_semantics/notebooks
Not running on Google Colab.


In [3]:
import os

import config
from src.optimize_weights import get_search_space, create_loss_function, optimize_embedding_weights
from src.utils import load_model, setup_environment, initialize_qdrant_client, initialize_model, load_test_queries

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [None]:
def get_fixed_parameters():
    """
    Build the constant parameters used for evaluation.

    Returns:
        tuple: ``(author_name_ids, author_ids, type_ids)`` where
            - author_name_ids (dict): author name -> integer id, numbered from 1
              in the order dante, singleton, musa, kirkpatrick, durling;
            - author_ids (list): the ids ``[1, 2, 3, 4, 5]``;
            - type_ids (int): the single value ``1``.
    """
    # Ids are assigned by position (starting at 1), so the name order is significant.
    ordered_names = ("dante", "singleton", "musa", "kirkpatrick", "durling")
    author_name_ids = {name: position for position, name in enumerate(ordered_names, start=1)}
    # Dicts preserve insertion order, so this yields the ids in ascending order.
    author_ids = list(author_name_ids.values())
    type_ids = 1
    return author_name_ids, author_ids, type_ids


In [None]:
# Plain constants for this run (no side effects, so they are declared up front).
model_name = "multilingual_e5"
collection_name = "dante_multilingual_e5"
columns = ["musa", "kirkpatrick", "durling"]

# Environment first, then the Qdrant client and the embedding model;
# initialize_model also reports the embedding dimension.
setup_environment()
qdrant_client = initialize_qdrant_client()
model, embedding_dim = initialize_model(model_name)

# Evaluation inputs: test queries read from the parquet file under config.ROOT
# (n=2 is forwarded to load_test_queries -- presumably a cap on the number of
# queries; confirm against src.utils), plus the fixed author/type parameters.
test_queries_path = os.path.join(config.ROOT, "data/paraphrased_verses.parquet")
test_queries = load_test_queries(test_queries_path, n=2)
author_name_ids, author_ids, type_ids = get_fixed_parameters()

# Search space over the selected columns and the loss built on top of it.
space = get_search_space(columns)
loss = create_loss_function(
    space,
    qdrant_client,
    collection_name,
    model,
    embedding_dim,
    author_name_ids,
    author_ids,
    type_ids,
    test_queries,
    model_name,
)

# Run the optimisation and report the result.
best_weights = optimize_embedding_weights(loss, space, columns)
print("Optimized embedding weights:", best_weights)
