<a href="https://colab.research.google.com/github/giambono/divine_semantics/blob/main/notebooks/run_compute_qdrant_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Check if the notebook is running on Google Colab
def is_colab():
    """Report whether this notebook is executing inside Google Colab.

    The check simply tries to import ``google.colab``: a successful import
    yields ``True``, an ``ImportError`` yields ``False``.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        on_colab = False
    else:
        on_colab = True
    return on_colab

# Clone the repository if running on Colab
if is_colab():
    print("Running on Google Colab. Cloning repository...")
    !git clone https://github.com/giambono/divine_semantics.git
    os.chdir("/content/divine_semantics")
    !pip install -r requirements.txt
else:
    import sys
    sys.path.append("..")
    print(f"Working directory set to: {os.getcwd()}")
    print("Not running on Google Colab.")

In [2]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient

# Read a local .env file (when present) so the Qdrant settings below can be
# resolved from the environment.
load_dotenv()

# Connection settings; either value is None when its variable is not defined.
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")

qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

In [3]:
import ast
import numpy as np
import pandas as pd
import src.compute_embeddings as compute_embeddings
from src.db_helper import get_db_connection
from src.utils import load_model


# **Choose the DataFrame to embed**

In [None]:

# Alternative data source (currently disabled): build `df` from the
# `divine_comedy` and `verse_mappings` database tables instead of a CSV file.
# Uncomment the whole block to use it; it relies on the `ast`, `pd` and
# `get_db_connection` imports from the import cell.

# conn = get_db_connection()  # Ensure get_db_connection() is defined/imported
# df = pd.read_sql_query("SELECT * FROM divine_comedy", conn)
# verse_mappings = pd.read_sql_query("SELECT * FROM verse_mappings", conn)

# # Merge the cumulative_indices column from verse_mappings into df
# df = df.merge(
#     verse_mappings[['cantica_id', 'canto', 'start_verse', 'end_verse', 'cumulative_indices']],
#     on=['cantica_id', 'canto', 'start_verse', 'end_verse'],
#     how='left'
# )

# # Convert the string to a list
# df['cumulative_indices'] = df['cumulative_indices'].apply(
#     lambda x: ast.literal_eval(x) if pd.notnull(x) else []
# )


In [13]:
# Load data/parafrasi.csv (path is relative to the current working directory);
# the file uses ';' as its field separator.
df = pd.read_csv(
    "data/parafrasi.csv",
    sep=";",
)

In [14]:
# Show the column names of the loaded frame; the same names are listed as
# payload columns in the upsert cell further down.
df.columns

Index(['cantica', 'canto', 'cum_verse_number', 'verse_number', 'verse_start',
       'verse_end', 'text', 'author', 'style', 'temperature', 'prompt'],
      dtype='object')

In [10]:
# List the collections that currently exist on the Qdrant server.
qdrant_client.get_collections()
# Uncomment the next line to drop a collection (destructive):
#qdrant_client.delete_collection(collection_name="dante_multilingual_e5")

CollectionsResponse(collections=[CollectionDescription(name='dante_fake_text'), CollectionDescription(name='dante_multilingual_e5'), CollectionDescription(name='dante_multilingual_e5_optim_weights')])

# **Compute embeddings and upload to Qdrant**




In [15]:
# Embedding model to run, registered under its short key.
model_key = "multilingual_e5"
embedding_model = load_model(model_key)
models = {model_key: embedding_model}

# Columns handed over as `payload_columns` (all columns of the loaded CSV).
payload_columns = [
    'cantica',
    'canto',
    'cum_verse_number',
    'verse_number',
    'verse_start',
    'verse_end',
    'text',
    'author',
    'style',
    'temperature',
    'prompt',
]

# NOTE(review): the target collection name is presumably prefix + model key
# (a later cell queries "dante_parafrasi_multilingual_e5") -- confirm in
# src.compute_embeddings.
compute_embeddings.compute_embeddings_and_upsert(
    df,
    models,
    qdrant_client,
    collection_name_prefix="dante_parafrasi_",
    payload_columns=payload_columns,
)

Computing embeddings with multilingual_e5...
Upserted batch 1 containing 100 points
Upserted batch 2 containing 100 points
Upserted batch 3 containing 100 points
Upserted batch 4 containing 100 points
Upserted batch 5 containing 100 points
Upserted batch 6 containing 100 points
Upserted batch 7 containing 100 points
Upserted batch 8 containing 100 points
Upserted batch 9 containing 100 points
Upserted batch 10 containing 100 points
Upserted batch 11 containing 100 points
Upserted batch 12 containing 100 points
Upserted batch 13 containing 100 points
Upserted batch 14 containing 100 points
Upserted batch 15 containing 100 points
Upserted batch 16 containing 100 points
Upserted batch 17 containing 100 points
Upserted batch 18 containing 100 points
Upserted batch 19 containing 100 points
Upserted batch 20 containing 100 points
Upserted batch 21 containing 100 points
Upserted batch 22 containing 100 points
Upserted batch 23 containing 100 points
Upserted batch 24 containing 100 points
Upse

In [None]:
# Fetch the server-side description of the "dante_parafrasi_multilingual_e5"
# collection (presumably the one filled by the upsert cell -- confirm).
collection = qdrant_client.get_collection("dante_parafrasi_multilingual_e5")