In [20]:

from dotenv import load_dotenv
import os
import os
import pandas as pd
import kagglehub
import numpy as np
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Load variables from a local .env file (if any) into the process environment
# so the settings below can be overridden without editing the notebook.
load_dotenv()

# Connect to Milvus. Host and port fall back to the previous hard-coded values
# when MILVUS_HOST / MILVUS_PORT are not set.
MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
print("Connected to Milvus!")

# Expected location of the dataset CSV, resolved relative to the current
# user's home directory instead of one specific user's absolute path.
DATASET_PATH = os.path.expanduser(
    '~/.cache/kagglehub/datasets/yamaerenay/spotify-dataset-1921-2020-160k-tracks/versions/1/data.csv'
)





Connected to Milvus!


In [None]:
# Check if the dataset exists locally, load if it does, otherwise download.
# The download call returns the directory it stored the dataset in; use that
# directory to locate data.csv instead of assuming it matches DATASET_PATH
# (e.g. when a newer dataset version is fetched).
if not os.path.exists(DATASET_PATH):
    print("Dataset not found locally. Downloading from KaggleHub...")
    download_dir = kagglehub.dataset_download('yamaerenay/spotify-dataset-1921-2020-160k-tracks')
    DATASET_PATH = os.path.join(download_dir, 'data.csv')
else:
    print(f"Dataset found locally at {DATASET_PATH}.")

# Malformed CSV lines are skipped rather than aborting the load.
df = pd.read_csv(DATASET_PATH, on_bad_lines='skip')
print(f"Loaded dataset with {len(df)} rows.")

Dataset found locally at /Users/devyanigauri/.cache/kagglehub/datasets/yamaerenay/spotify-dataset-1921-2020-160k-tracks/versions/1/data.csv.


  from .autonotebook import tqdm as notebook_tqdm


Loaded dataset with 170653 rows.


In [4]:
# Show the column labels of the loaded dataset.
df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

In [5]:
# Partition the dataset columns into two groups.

# Columns kept as per-track metadata.
metadata_columns = [
    'id',
    'name',
    'artists',
    'year',
    'release_date',
    'explicit',
    'popularity',
    'duration_ms',
    'key',
    'mode',
]

# Columns serialized to JSON and used as input for the embedding.
embedding_columns = [
    'valence',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
]


In [None]:
from sentence_transformers import SentenceTransformer

# Load nomic model for embeddings
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm the source is trusted and consider
# pinning a revision.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

<All keys matched successfully>


In [7]:
import json

# Create JSON column for embedding input
def row_to_json(row, columns=None):
    """Serialize selected fields of a row to a JSON object string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by column name.
        columns: Iterable of column names to include, in output order. When
            omitted, the notebook-level ``embedding_columns`` list is used,
            which matches the previous single-argument behaviour.

    Returns:
        A JSON string mapping each selected column name to its value in ``row``.
    """
    if columns is None:
        columns = embedding_columns
    return json.dumps({col: row[col] for col in columns})

# One JSON string per row, built from the embedding feature columns.
df['embedding_json'] = df.apply(row_to_json, axis=1)

# Encode the JSON strings in fixed-size chunks and collect every vector.
batch_size = 1000
jsons = df['embedding_json'].tolist()
embeddings = []
chunk_starts = range(0, len(jsons), batch_size)
for batch_number, offset in enumerate(chunk_starts, start=1):
    print(f"Processing batch {batch_number}")
    chunk = jsons[offset:offset + batch_size]
    embeddings.extend(model.encode(chunk, show_progress_bar=True))

# Attach the collected vectors to the frame, one per row.
df['embedding'] = embeddings
print("Batch Nomic embeddings added to DataFrame.")

Processing batch 1


Batches: 100%|██████████| 32/32 [00:21<00:00,  1.50it/s]



Processing batch 2


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.43it/s]



Processing batch 3


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.45it/s]



Processing batch 4


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.45it/s]



Processing batch 5


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.43it/s]



Processing batch 6


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 7


Batches: 100%|██████████| 32/32 [00:29<00:00,  1.09it/s]



Processing batch 8


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.37it/s]



Processing batch 9


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.39it/s]



Processing batch 10


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.41it/s]



Processing batch 11


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.43it/s]



Processing batch 12


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.43it/s]



Processing batch 13


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.39it/s]



Processing batch 14


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]


Processing batch 15


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]


Processing batch 16


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 17


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 18


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 19


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]



Processing batch 20


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 21


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 22


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]


Processing batch 23


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]



Processing batch 24


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 25


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 26


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]


Processing batch 27


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 28


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]


Processing batch 29


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 30


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.28it/s]



Processing batch 31


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.28it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.28it/s]


Processing batch 32


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.27it/s]



Processing batch 33


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]



Processing batch 34


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.28it/s]



Processing batch 35


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.28it/s]



Processing batch 36


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.24it/s]



Processing batch 37


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]


Processing batch 38


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 39


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]


Processing batch 40


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 41


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 42


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 43


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s]



Processing batch 44


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 45


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 46


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 47


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 48


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 49


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 50


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 51


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 52


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]


Processing batch 53


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 54


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 55


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]


Processing batch 56


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]


Processing batch 57


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 58


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]


Processing batch 59


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s]



Processing batch 60


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.39it/s]



Processing batch 61


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.38it/s]



Processing batch 62


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.38it/s]



Processing batch 63


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s]



Processing batch 64


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]



Processing batch 65


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]



Processing batch 66


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 67


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]



Processing batch 68


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 69


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 70


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 71


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]



Processing batch 72


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.25it/s]



Processing batch 73


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.27it/s]



Processing batch 74


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.28it/s]



Processing batch 75


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.28it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.28it/s]


Processing batch 76


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.33it/s]



Processing batch 77


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.33it/s]



Processing batch 78


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 79


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 80


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.33it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.33it/s]


Processing batch 81


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 82


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]



Processing batch 83


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 84


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]



Processing batch 85


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 86


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 87


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]


Processing batch 88


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 89


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 90


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.18it/s]



Processing batch 91


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]


Processing batch 92


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 93


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.33it/s]



Processing batch 94


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 95


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 96


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 97


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]


Processing batch 98


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 99


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.27it/s]



Processing batch 100


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 101


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 102


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.28it/s]



Processing batch 103


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 104


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]


Processing batch 105


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 106


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 107


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 108


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 109


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s]



Processing batch 110


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.39it/s]



Processing batch 111


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.39it/s]
Batches: 100%|██████████| 32/32 [00:22<00:00,  1.39it/s]


Processing batch 112


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.39it/s]



Processing batch 113


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.39it/s]



Processing batch 114


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.37it/s]



Processing batch 115


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 116


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]


Processing batch 117


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]


Processing batch 118


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.37it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.37it/s]


Processing batch 119


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.37it/s]



Processing batch 120


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.38it/s]



Processing batch 121


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 122


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s]



Processing batch 123


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]


Processing batch 124


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]


Processing batch 125


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]


Processing batch 126


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.39it/s]
Batches: 100%|██████████| 32/32 [00:22<00:00,  1.39it/s]


Processing batch 127


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.39it/s]



Processing batch 128


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.39it/s]



Processing batch 129


Batches: 100%|██████████| 32/32 [00:22<00:00,  1.40it/s]



Processing batch 130


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.37it/s]



Processing batch 131


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.37it/s]



Processing batch 132


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s]



Processing batch 133


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s]
Batches: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s]


Processing batch 134


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.33it/s]



Processing batch 135


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 136


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 137


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 138


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]


Processing batch 139


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]



Processing batch 140


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 141


Batches: 100%|██████████| 32/32 [00:29<00:00,  1.07it/s]



Processing batch 142


Batches: 100%|██████████| 32/32 [00:25<00:00,  1.27it/s]



Processing batch 143


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]


Processing batch 144


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]



Processing batch 145


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 146


Batches: 100%|██████████| 32/32 [00:26<00:00,  1.23it/s]



Processing batch 147


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 148


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]



Processing batch 149


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 150


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 151


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 152


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 153


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 154


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]



Processing batch 155


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]


Processing batch 156


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 157


Batches: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]



Processing batch 158


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 159


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 160


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 161


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]



Processing batch 162


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]


Processing batch 163


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]


Processing batch 164


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Processing batch 165


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s]



Processing batch 166


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 167


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]



Processing batch 168


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]
Batches: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]


Processing batch 169


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.29it/s]



Processing batch 170


Batches: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



Processing batch 171


Batches: 100%|██████████| 21/21 [00:16<00:00,  1.28it/s]

Batch Nomic embeddings added to DataFrame.





In [8]:
# Display the DataFrame, which now also carries the embedding_json and embedding columns.
df

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,embedding_json,embedding
0,0.0594,1921,0.98200,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878000,...,0.6650,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954,"{""valence"": 0.0594, ""acousticness"": 0.982, ""da...","[-0.045071423, -0.1503082, -4.604198, -0.32863..."
1,0.9630,1921,0.73200,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.000000,...,0.1600,-12.441,1,Clancy Lowered the Boom,5,1921,0.4150,60.936,"{""valence"": 0.963, ""acousticness"": 0.732, ""dan...","[0.0073481873, -0.075731, -4.574482, -0.353133..."
2,0.0394,1921,0.96100,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913000,...,0.1010,-14.850,1,Gati Bali,5,1921,0.0339,110.339,"{""valence"": 0.0394, ""acousticness"": 0.961, ""da...","[-0.0741878, -0.15236205, -4.5514393, -0.30073..."
3,0.1650,1921,0.96700,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,0.000028,...,0.3810,-9.316,1,Danny Boy,3,1921,0.0354,100.109,"{""valence"": 0.165, ""acousticness"": 0.967, ""dan...","[-0.109299876, -0.10954094, -4.4559555, -0.339..."
4,0.2530,1921,0.95700,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,0.000002,...,0.2290,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.0380,101.665,"{""valence"": 0.253, ""acousticness"": 0.957, ""dan...","[-0.19379157, -0.11196786, -4.5607157, -0.2970..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170648,0.6080,2020,0.08460,"['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna...",0.786,301714,0.808,0,0KkIkfsLEJbrcIhYsCL7L5,0.000289,...,0.0822,-3.702,1,China,72,2020-05-29,0.0881,105.029,"{""valence"": 0.608, ""acousticness"": 0.0846, ""da...","[0.018091697, -0.025504714, -4.5616045, -0.336..."
170649,0.7340,2020,0.20600,['Ashnikko'],0.717,150654,0.753,0,0OStKKAuXlxA0fMH54Qs6E,0.000000,...,0.1010,-6.020,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936,"{""valence"": 0.7340000000000001, ""acousticness""...","[0.0893945, -0.13040088, -4.576856, -0.2768545..."
170650,0.6370,2020,0.10100,['MAMAMOO'],0.634,211280,0.858,0,4BZXVFYCb76Q0Klojq4piV,0.000009,...,0.2580,-2.226,0,AYA,76,2020-11-03,0.0809,91.688,"{""valence"": 0.637, ""acousticness"": 0.101, ""dan...","[-0.11824579, -0.08876313, -4.533818, -0.25961..."
170651,0.1950,2020,0.00998,['Eminem'],0.671,337147,0.623,1,5SiZJoLXp3WOl3J4C8IK0d,0.000008,...,0.6430,-7.161,1,Darkness,70,2020-01-17,0.3080,75.055,"{""valence"": 0.195, ""acousticness"": 0.00998, ""d...","[-0.047601998, -0.18284361, -4.5687547, -0.298..."


In [9]:
# Persist the frame (without the index column) so the embeddings can be reloaded later.
# NOTE(review): the embedding column is written as text; when read back it is a
# whitespace-separated bracketed string rather than a list — confirm this
# round-trip is acceptable.
output_csv_path = '/Users/devyanigauri/Documents/GitHub/rhythmodoro/src/spotify_dataset_with_embeddings.csv'
df.to_csv(output_csv_path, index=False)
print("DataFrame with embeddings saved to CSV.")

DataFrame with embeddings saved to CSV.


In [3]:
# Reload the previously saved CSV of tracks plus embeddings.
# The 'embedding' column comes back as strings and is parsed into lists by np_str_to_list.
embedded_music_data = pd.read_csv('/Users/devyanigauri/Documents/GitHub/rhythmodoro/src/spotify_dataset_with_embeddings.csv')

In [None]:
# Inspect the raw 'embedding' value of the first row as loaded from the CSV.
embedded_music_data['embedding'].iloc[0]

'[-4.50714231e-02 -1.50308207e-01 -4.60419798e+00 -3.28630447e-01\n  4.81912345e-01  4.83927429e-01  9.68279481e-01  3.51697594e-01\n -7.78256595e-01 -1.31623730e-01 -5.04605293e-01  1.78278196e+00\n  3.27721477e-01  1.05301523e+00 -5.91609895e-01 -9.25122261e-01\n -1.07802105e+00 -1.05986774e+00  3.72252792e-01  6.04006767e-01\n -9.89788115e-01 -1.57479978e+00 -1.53587627e+00 -7.07414746e-01\n  2.28120327e+00  1.70042253e+00 -3.14354867e-01  4.64295805e-01\n  6.23310208e-02 -6.63552701e-01  2.13358179e-01 -6.79492354e-01\n  1.06242359e+00  1.05392241e+00 -1.54053676e+00 -3.14550787e-01\n -2.45892897e-01  8.44708264e-01  9.30151224e-01 -3.53472531e-02\n  6.07331097e-01 -7.99949467e-01  8.78421664e-01  3.80431950e-01\n  1.05354381e+00 -2.28832334e-01  1.03983951e+00 -4.74139631e-01\n -9.75247547e-02  1.64906323e-01 -2.98518240e-01 -7.84355760e-01\n  1.08925998e-01 -1.02904193e-01  2.62131095e+00  6.74720287e-01\n -8.97888839e-01  3.98063481e-01 -1.38710812e-01  7.11673021e-01\n  2.69109

In [24]:
import numpy as np

def np_str_to_list(emb):
    """Convert a stringified vector into a list of floats.

    Accepts both whitespace-separated text such as ``"[0.1 0.2\\n 0.3]"`` and
    comma-separated text such as ``"[0.1, 0.2, 0.3]"``. Surrounding whitespace
    and the enclosing square brackets are ignored.

    Args:
        emb: The value to convert. Non-string values are returned unchanged,
            so applying this function twice is harmless.

    Returns:
        A list of floats for string input (empty list for ``"[]"``), otherwise
        ``emb`` itself.

    Raises:
        ValueError: If a token in the string cannot be parsed as a float.
    """
    if not isinstance(emb, str):
        return emb  # Already a list or array
    # Trim outer whitespace first so the brackets are actually at the edges.
    inner = emb.strip().strip('[]')
    # Treat commas as separators too, then split on any run of whitespace
    # (including the newlines embedded in multi-line array text).
    return [float(token) for token in inner.replace(',', ' ').split()]
# Parse every stringified embedding into a list of floats; values that are not
# strings are passed through unchanged by np_str_to_list.
embedded_music_data['embedding'] = embedded_music_data['embedding'].apply(np_str_to_list)

In [None]:
# from pymilvus import list_collections
# list_collections()

['embedded_music_data']

In [None]:
# from pymilvus import Collection, list_collections

# for name in list_collections():
#     print(f"Dropping collection: {name}")
#     Collection(name).drop()

Dropping collection: embedded_music_data


In [48]:
# Vector field holding the embedding itself.
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=768,  # assumes all embeddings have same length
)

# Full field list: track metadata first, then the JSON that was embedded,
# then the vector. "id" is the caller-supplied primary key (auto_id disabled).
fields = [
    FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=64, is_primary=True, auto_id=False),
    FieldSchema(name="name", dtype=DataType.VARCHAR, max_length=256),
    FieldSchema(name="artists", dtype=DataType.VARCHAR, max_length=1000),
    FieldSchema(name="year", dtype=DataType.INT64),
    FieldSchema(name="release_date", dtype=DataType.VARCHAR, max_length=32),
    FieldSchema(name="explicit", dtype=DataType.BOOL),
    FieldSchema(name="popularity", dtype=DataType.INT64),
    FieldSchema(name="duration_ms", dtype=DataType.INT64),
    FieldSchema(name="key", dtype=DataType.INT64),
    FieldSchema(name="mode", dtype=DataType.INT64),
    FieldSchema(name="embedding_json", dtype=DataType.VARCHAR, max_length=2048),
    embedding_field,
]

schema = CollectionSchema(fields, description="Music tracks with metadata and Nomic embeddings")

# Create the collection on the "default" connection, split across 2 shards.
collection = Collection(
    name="embedded_music_data",
    schema=schema,
    using="default",
    shards_num=2,
)

print("Milvus collection 'embedded_music_data' created.")

Milvus collection 'embedded_music_data' created.


In [50]:
# Get a handle to the existing collection
collection = Collection("embedded_music_data")

# Column names in the same order as the fields of the collection schema;
# each insert payload is a list of per-column value lists in this order.
insert_columns = [
    "id", "name", "artists", "year", "release_date", "explicit",
    "popularity", "duration_ms", "key", "mode", "embedding_json", "embedding",
]

# Insert in fixed-size row batches
batch_size = 1000
num_rows = len(embedded_music_data)

for start in range(0, num_rows, batch_size):
    end = min(start + batch_size, num_rows)
    batch = embedded_music_data.iloc[start:end]
    # One list of values per column, in schema order
    data = [batch[column].tolist() for column in insert_columns]
    collection.insert(data)
    print(f"Inserted rows {start} to {end}")

# Flush once after the bulk load so the inserted rows are persisted before
# any index is built on the collection.
collection.flush()

print("All data inserted into Milvus.")

Inserted rows 0 to 1000
Inserted rows 1000 to 2000
Inserted rows 2000 to 3000
Inserted rows 3000 to 4000
Inserted rows 4000 to 5000
Inserted rows 5000 to 6000
Inserted rows 6000 to 7000
Inserted rows 7000 to 8000
Inserted rows 8000 to 9000
Inserted rows 9000 to 10000
Inserted rows 10000 to 11000
Inserted rows 11000 to 12000
Inserted rows 12000 to 13000
Inserted rows 13000 to 14000
Inserted rows 14000 to 15000
Inserted rows 15000 to 16000
Inserted rows 16000 to 17000
Inserted rows 17000 to 18000
Inserted rows 18000 to 19000
Inserted rows 19000 to 20000
Inserted rows 20000 to 21000
Inserted rows 21000 to 22000
Inserted rows 22000 to 23000
Inserted rows 23000 to 24000
Inserted rows 24000 to 25000
Inserted rows 25000 to 26000
Inserted rows 26000 to 27000
Inserted rows 27000 to 28000
Inserted rows 28000 to 29000
Inserted rows 29000 to 30000
Inserted rows 30000 to 31000
Inserted rows 31000 to 32000
Inserted rows 32000 to 33000
Inserted rows 33000 to 34000
Inserted rows 34000 to 35000
Inserte

In [51]:
# Index configuration for the "embedding" vector field.
# NOTE(review): Inner Product matches cosine similarity only for unit-length
# vectors; the embeddings displayed earlier contain components around -4.6, so
# they do not appear to be normalized — confirm IP is the intended metric.
index_params = {
    "metric_type": "IP",  # Inner Product similarity
    "index_type": "HNSW", # HNSW index type; tuned via M and efConstruction below
    "params": {"M": 8, "efConstruction": 64} # M: number of bi-directional links per node; efConstruction: controls index build accuracy/speed
}
collection.create_index(field_name="embedding", index_params=index_params)

Status(code=0, message=)

In [52]:
# Close the "default" Milvus connection opened at the start of the notebook.
connections.disconnect(alias="default")