<a href="https://colab.research.google.com/github/giambono/divine_semantics/blob/main/notebooks/run_compute_qdrant_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Check if the notebook is running on Google Colab
def is_colab():
    """Return True when the `google.colab` module is importable, else False."""
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

# Clone the repository if running on Colab
if is_colab():
    print("Running on Google Colab. Cloning repository...")
    !git clone https://github.com/giambono/divine_semantics.git
    os.chdir("/content/divine_semantics")
    !pip install -r requirements.txt
else:
    import sys
    sys.path.append("..")
    print(f"Working directory set to: {os.getcwd()}")
    print("Not running on Google Colab.")

Running on Google Colab. Cloning repository...
Cloning into 'divine_semantics'...
remote: Enumerating objects: 92, done.
remote: Counting objects: 100% (92/92), done.
remote: Compressing objects: 100% (66/66), done.
remote: Total 92 (delta 33), reused 76 (delta 20), pack-reused 0 (from 0)
Receiving objects: 100% (92/92), 1.58 MiB | 7.81 MiB/s, done.
Resolving deltas: 100% (33/33), done.
Collecting scikit-optimize (from -r requirements.txt (line 6))
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting mysql-connector-python (from -r requirements.txt (line 7))
  Downloading mysql_connector_python-9.2.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.0 kB)
Collecting python-dotenv (from -r requirements.txt (line 8))
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting qdrant-client (from -r requirements.txt (line 9))
  Downloading qdrant_client-1.13.2-py3-none-any.whl.metadata (10 kB)
Collecting datasets (from -r re

In [2]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient

import warnings

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Connection settings are read from the environment, never hard-coded.
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")

# A missing QDRANT_URL is passed to the client as url=None, which does not fail
# here but surfaces later as an opaque connection error on the first request.
# Warn right away so the cause is visible (e.g. a fresh clone with no .env file).
if not qdrant_url:
    warnings.warn(
        "QDRANT_URL is not set; QdrantClient will be created with url=None and "
        "use its default endpoint. Set QDRANT_URL (and QDRANT_API_KEY) in the "
        "environment or in a .env file before running the Qdrant cells."
    )

qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
)

In [6]:
import ast
import numpy as np
import pandas as pd
import src.compute_embeddings as compute_embeddings
from src.db_helper import get_db_connection
from src.utils import load_model


In [7]:

# Read both tables through a single connection obtained from the project helper.
conn = get_db_connection()
df = pd.read_sql_query("SELECT * FROM divine_comedy", conn)
verse_mappings = pd.read_sql_query("SELECT * FROM verse_mappings", conn)

# Columns shared by both tables that the join is performed on.
merge_keys = ['cantica_id', 'canto', 'start_verse', 'end_verse']

# Left join: every row of divine_comedy is kept; rows without a matching
# mapping get a null cumulative_indices value.
df = pd.merge(
    df,
    verse_mappings[merge_keys + ['cumulative_indices']],
    how='left',
    on=merge_keys,
)

# Parse each non-null cumulative_indices value as a Python literal
# (the column is expected to hold string-encoded lists); nulls become [].
df['cumulative_indices'] = df['cumulative_indices'].apply(
    lambda raw: [] if pd.isnull(raw) else ast.literal_eval(raw)
)


In [9]:
# Connectivity check: ask the server for its collections; the result is the cell output.
qdrant_client.get_collections()
# Destructive call, intentionally left disabled. Uncomment only to drop the collection:
# qdrant_client.delete_collection(collection_name="dante_multilingual_e5")

ResponseHandlingException: [Errno 99] Cannot assign requested address

In [None]:
# Load the single embedding model used in this run, keyed by its name.
model_key = "multilingual_e5"
models = {}
models[model_key] = load_model(model_key)

# Embed the rows of df and upsert them through the Qdrant client.
# NOTE(review): later cells read "dante_multilingual_e5", so the collection name
# is presumably prefix + model key -- confirm in compute_embeddings.
compute_embeddings.compute_embeddings_and_upsert(
    df,
    models,
    qdrant_client,
    collection_name_prefix="dante_",
)

In [None]:
# Fetch the collection description for the collection read back in the cells below.
collection = qdrant_client.get_collection(collection_name="dante_multilingual_e5")

In [None]:
# Accumulator for every point in the collection; reset on each run of the cell.
all_points = []
offset = None  # None asks the scroll API to start from the beginning

# Page through the collection 100 points at a time, with vectors and payloads.
while True:
    batch, next_offset = qdrant_client.scroll(
        collection_name="dante_multilingual_e5",
        limit=100,
        offset=offset,
        with_vectors=True,
        with_payload=True,
    )
    if batch:
        all_points.extend(batch)
        offset = next_offset  # resume from where this page ended

    # Stop on an empty page or when the server reports no further offset.
    if not batch or next_offset is None:
        break

In [None]:
import numpy as np
# Spot check on the first retrieved point: compute the norm of its stored vector.
first_vector = all_points[0].vector
v = np.array(first_vector)
norm = np.linalg.norm(v)
norm  # shown as the cell output