<a href="https://colab.research.google.com/github/giambono/divine_semantics/blob/main/run_compute_qdrant_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Check if the notebook is running on Google Colab
def is_colab():
    """Tell whether the notebook is being executed inside Google Colab.

    Returns:
        bool: True when the ``google.colab`` package can be imported,
        False when the import raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        running_on_colab = False
    else:
        running_on_colab = True
    return running_on_colab

# Clone the repository if running on Colab
if is_colab():
    print("Running on Google Colab. Cloning repository...")
    !git clone https://github.com/giambono/divine_semantics.git
    os.chdir("/content/divine_semantics")
    !pip install -r requirements.txt
else:
    import sys
    sys.path.append("..")
    print(f"Working directory set to: {os.getcwd()}")
    print("Not running on Google Colab.")

Working directory set to: /home/rfflpllcn/IdeaProjects/divine_semantics/notebooks
Not running on Google Colab.


In [1]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient

# Let python-dotenv populate the process environment before reading settings.
load_dotenv()

# Endpoint and API key are read from the environment (None when unset).
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")

# Client handle shared by the cells below.
qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

In [3]:
import numpy as np
import pandas as pd
import src.compute_embeddings as compute_embeddings
from src.db_helper import get_db_connection
from src.utils import load_model


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# `ast` is required by literal_eval below but is not imported in any of the
# visible cells; without this import a fresh kernel raises NameError here.
import ast

# Read both tables in full through the project's DB connection helper.
conn = get_db_connection()  # Ensure get_db_connection() is defined/imported
df = pd.read_sql_query("SELECT * FROM divine_comedy", conn)
verse_mappings = pd.read_sql_query("SELECT * FROM verse_mappings", conn)

# Merge the cumulative_indices column from verse_mappings into df.
# The left join keeps every row of df; rows with no matching mapping get a
# null cumulative_indices, which is normalised to [] below.
df = df.merge(
    verse_mappings[['cantica_id', 'canto', 'start_verse', 'end_verse', 'cumulative_indices']],
    on=['cantica_id', 'canto', 'start_verse', 'end_verse'],
    how='left'
)

# Convert the string to a list (literal_eval parses the stored Python literal;
# null entries become an empty list).
df['cumulative_indices'] = df['cumulative_indices'].apply(
    lambda x: ast.literal_eval(x) if pd.notnull(x) else []
)


In [4]:
# Display the collections currently known to the Qdrant client.
qdrant_client.get_collections()
# Destructive maintenance step, left disabled: uncomment to delete the collection.
#qdrant_client.delete_collection(collection_name="dante_multilingual_e5")

CollectionsResponse(collections=[CollectionDescription(name='dante_fake_text')])

In [None]:
# Load a single model by key and pass it, keyed by that same name, to the
# project's embedding routine together with the dataframe and Qdrant client.
model_key = "fake_text"
models = {model_key: load_model(model_key)}

# NOTE(review): the target collection is presumably named
# collection_name_prefix + model_key (e.g. "dante_fake_text") -- confirm in
# src.compute_embeddings.
compute_embeddings.compute_embeddings_and_upsert(df, models, qdrant_client, collection_name_prefix="dante_")

In [4]:
# Look up the "dante_multilingual_e5" collection through the client and keep
# the result in `collection` for inspection.
collection = qdrant_client.get_collection("dante_multilingual_e5")

In [9]:
# Accumulator for every point returned by the collection.
all_points = []
offset = None  # None asks scroll() to start from the first page

# Arguments that stay the same for every page request.
scroll_kwargs = dict(
    collection_name="dante_multilingual_e5",
    limit=100,
    with_vectors=True,
    with_payload=True,
)

# Page through the collection until a page is empty or no next offset is given.
fetching = True
while fetching:
    scroll_result, next_offset = qdrant_client.scroll(offset=offset, **scroll_kwargs)
    if scroll_result:
        all_points.extend(scroll_result)
        offset = next_offset  # resume from here on the next request
        fetching = next_offset is not None
    else:
        # Empty page: nothing left to read; offset is left untouched.
        fetching = False

In [15]:
import numpy as np
# Sanity check on the first retrieved point: compute the norm of its vector
# (np.linalg.norm defaults to the Euclidean norm for 1-D input).
v = np.array(all_points[0].vector)
norm = np.linalg.norm(v)
# Bare last expression so the notebook displays the value.
# NOTE(review): a value close to 1 would suggest the stored vectors are unit-normalised -- confirm.
norm

np.float64(0.9999999889255714)