In [None]:
import os
from qdrant_client import QdrantClient
from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load the variables it defines into
# the process environment; the boolean result is discarded into `_`.
_ = load_dotenv(find_dotenv())

# openai.api_Key = os.environ.get("OPENAI_API_KEY")
# API key passed to the Qdrant client; None when the variable is not set.
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

In [13]:
from qdrant_client.http.models import Distance, VectorParams

# Alternative connection to the remote deployment, kept for reference:
# client = QdrantClient(
#     url="https://db.ncdedinsky.com",
#     port=443,
#     api_key=QDRANT_API_KEY,
# )

# Connection settings for the Qdrant instance listening on localhost:6333.
qdrant_connection = {
    "url": "http://localhost",
    "port": 6333,
    "api_key": QDRANT_API_KEY,
}
client = QdrantClient(**qdrant_connection)

# Name of the collection that holds the song embeddings.
index_name = "song-embeddings-index"

# One-time collection creation, kept for reference (left disabled here):
#client.create_collection(
#    collection_name=index_name,
#    vectors_config=VectorParams(size=11904, distance=Distance.COSINE),
#)

# Last expression of the cell: display the collections known to the server.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='song-embeddings-index')])

In [18]:
import os
import uuid
import hashlib
import itertools
import numpy as np
from qdrant_client.http.models import Batch


def chunks(iterable, batch_size=100):
    """Lazily split ``iterable`` into consecutive tuples of ``batch_size`` items.

    Args:
        iterable: Any iterable; it is consumed exactly once, in order.
        batch_size: Maximum number of items per yielded tuple.

    Yields:
        Tuples holding up to ``batch_size`` items each. Only the final tuple
        may be shorter; an empty iterable yields nothing.
    """
    source = iter(iterable)
    # Two-argument iter(): keep calling the lambda until it returns the
    # sentinel, i.e. an empty tuple, which means the source is exhausted.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())


def filename_to_uuid(filename):
    """Derive a deterministic UUID string from ``filename``.

    The name is encoded with the default ``str.encode()`` encoding and hashed
    with SHA-1; the first 16 bytes (32 hex digits) of the digest become the
    UUID value.

    Args:
        filename: Text to hash.

    Returns:
        The UUID in its canonical hyphenated string form. Equal inputs always
        produce equal outputs.
    """
    digest_hex = hashlib.sha1(filename.encode()).hexdigest()
    # 32 hex characters == 16 bytes == one 128-bit UUID.
    return str(uuid.UUID(hex=digest_hex[:32]))


def get_embeddings(directory):
    """Walk ``directory`` recursively and yield every stored mel embedding.

    Only files whose name ends with ``_mel_embedding.npy`` are considered;
    everything else is skipped.

    Args:
        directory: Root directory handed to ``os.walk``.

    Yields:
        ``(file_uuid, embedding, spotify_id)`` tuples where ``spotify_id`` is
        the part of the filename before ``_mel_embedding.npy``, ``file_uuid``
        is ``filename_to_uuid(spotify_id)`` and ``embedding`` is the loaded
        array converted with ``.tolist()``.
    """
    suffix = '_mel_embedding.npy'
    for dirpath, _subdirs, filenames in os.walk(directory):
        for name in filenames:
            if not name.endswith(suffix):
                continue
            array = np.load(os.path.join(dirpath, name))
            # Text before the first occurrence of the suffix; it serves both
            # as the payload value and as the seed for the point UUID.
            spotify_id = name.partition(suffix)[0]
            point_id = filename_to_uuid(spotify_id)
            yield (point_id, array.tolist(), spotify_id)


# Embedding directories to ingest. Every entry is currently commented out, so
# running this cell uploads nothing; un-comment the ones to (re)ingest.
# Each entry keeps its trailing comma so that un-commenting neighbouring lines
# can never silently concatenate two path strings into one.
directories = [
    #"../../data/embeddings/test",
    #"../../data/embeddings/train",
    #"../../data/embeddings/validation",
    #"../../data/embeddings/parts1to5",
    #"../../data/embeddings/parts6to21",
    #"../../data/embeddings/parts22to30",
    #"../../data/embeddings/parts31to46/part1",
    #"../../data/embeddings/parts31to46/part2",
    #"../../data/embeddings/parts31to46/part3",
]

# Length every spotify_id parsed from a filename is required to have.
SPOTIFY_ID_LENGTH = 22


def split_batch(batch):
    """Split a batch of embedding records into the columns an upsert needs.

    Args:
        batch: Iterable of ``(point_id, vector, spotify_id)`` tuples, as
            produced by ``get_embeddings``.

    Returns:
        A ``(ids, vectors, payloads)`` tuple of equally long lists, where each
        payload is ``{"spotify_id": spotify_id}``.

    Raises:
        ValueError: If any ``spotify_id`` is not ``SPOTIFY_ID_LENGTH``
            characters long. The message names the offending ID.
    """
    ids, vectors, payloads = [], [], []
    for point_id, vector, spotify_id in batch:
        if len(spotify_id) != SPOTIFY_ID_LENGTH:
            # Raise instead of calling exit(): inside a notebook exit() shuts
            # the kernel down rather than reporting what went wrong.
            raise ValueError(
                f"Unexpected spotify_id {spotify_id!r}: expected "
                f"{SPOTIFY_ID_LENGTH} characters, got {len(spotify_id)}"
            )
        ids.append(point_id)
        vectors.append(vector)
        payloads.append({"spotify_id": spotify_id})
    return ids, vectors, payloads


# Process and upsert embeddings
for directory in directories:
    for batch in chunks(get_embeddings(directory), batch_size=150):
        # The whole batch is validated before anything is sent, so a malformed
        # ID aborts the run without uploading a partial batch.
        ids, vectors, spotify_ids = split_batch(batch)

        client.upsert(
            collection_name=index_name,
            # wait=False: do not wait for the server to finish applying the write.
            wait=False,
            points=Batch(
                ids=ids,
                payloads=spotify_ids,
                vectors=vectors,
            ),
        )