In [59]:

from dotenv import load_dotenv
import os
import os
import pandas as pd
import kagglehub
import numpy as np
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Read settings from a local .env file (if present) into the process environment.
load_dotenv()

# Connect to Milvus
# Host/port may be overridden with MILVUS_HOST / MILVUS_PORT (e.g. from .env);
# the defaults are the local-instance values this notebook has always used.
MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
print("Connected to Milvus!")

# KaggleHub dataset cache, resolved from the current user's home directory so the
# notebook is not tied to one machine's absolute /Users/<name> path.
KAGGLEHUB_DATASETS_DIR = os.path.join(os.path.expanduser("~"), ".cache", "kagglehub", "datasets")

SPOTIFY_DATASET_PATH = os.path.join(
    KAGGLEHUB_DATASETS_DIR,
    "yamaerenay", "spotify-dataset-1921-2020-160k-tracks", "versions", "1", "data.csv",
)

LYRICS_DATASET_PATH = os.path.join(
    KAGGLEHUB_DATASETS_DIR,
    "nikhilnayak123", "5-million-song-lyrics-dataset", "versions", "3", "ds2.csv",
)



Connected to Milvus!


In [14]:
# Check if the dataset exists locally, load if it does, otherwise download

spotify_csv_path = SPOTIFY_DATASET_PATH
if not os.path.exists(spotify_csv_path):
    print("Spotify Dataset not found locally. Downloading from KaggleHub...")
    # dataset_download returns the directory holding the downloaded files; read the
    # CSV from there instead of assuming it landed at the hard-coded path.
    spotify_download_dir = kagglehub.dataset_download('yamaerenay/spotify-dataset-1921-2020-160k-tracks')
    spotify_csv_path = os.path.join(spotify_download_dir, os.path.basename(SPOTIFY_DATASET_PATH))
else:
    print(f"Spotify Dataset found locally at {SPOTIFY_DATASET_PATH}.")

# Malformed CSV lines are skipped rather than aborting the load.
df = pd.read_csv(spotify_csv_path, on_bad_lines='skip')
print(f"Loaded Spotify dataset with {len(df)} rows.")

SSpotify Dataset found locally at /Users/devyanigauri/.cache/kagglehub/datasets/yamaerenay/spotify-dataset-1921-2020-160k-tracks/versions/1/data.csv.
Loaded Spotify dataset with 170653 rows.


In [13]:
# Load the lyrics dataset from the local KaggleHub cache, downloading it first if missing.
lyrics_csv_path = LYRICS_DATASET_PATH
if not os.path.exists(lyrics_csv_path):
    print("Lyrics Dataset not found locally. Downloading from KaggleHub...")
    # Use the directory returned by the download (previously assigned to an unused
    # variable) so the read below targets the files that were just fetched.
    lyrics_download_dir = kagglehub.dataset_download("nikhilnayak123/5-million-song-lyrics-dataset")
    lyrics_csv_path = os.path.join(lyrics_download_dir, os.path.basename(LYRICS_DATASET_PATH))
else:
    print(f"Lyrics Dataset found locally at {LYRICS_DATASET_PATH}.")

# Malformed CSV lines are skipped rather than aborting the load.
lyrics_df = pd.read_csv(lyrics_csv_path, on_bad_lines='skip')
print(f"Loaded Lyrics dataset with {len(lyrics_df)} rows.")

Lyrics Dataset found locally at /Users/devyanigauri/.cache/kagglehub/datasets/nikhilnayak123/5-million-song-lyrics-dataset/versions/3/ds2.csv.
Loaded Lyrics dataset with 5913411 rows.


In [16]:
# Inspect which columns the lyrics dataset provides.
lyrics_df.columns

Index(['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id'], dtype='object')

In [21]:
# Keep only the lyrics columns used later in the notebook: 'title' (merge key),
# 'artist' (artist-match filter) and 'lyrics' (the payload carried forward).
lyrics_df = lyrics_df[['title', 'artist', 'lyrics']]

KeyError: "['year'] not in index"

In [22]:
# Left-join Spotify tracks to lyrics rows on track name == lyrics title.
# The join key is the title only, so one track can pair with several same-titled
# lyrics rows; the artist-match filter further down narrows these pairs.
merged_df = pd.merge(df, lyrics_df, left_on='name', right_on='title', how='left')

In [23]:
# Row count after the title-only join.
len(merged_df)

4976672

In [24]:
import ast

# If 'artists' is a stringified list, convert it to a Python list
def parse_artists(artists):
    """Turn a stringified artist list into a real Python list.

    Args:
        artists: Raw cell value. A string is parsed as a Python literal
            (e.g. "['A', 'B']"); any other type is returned untouched.

    Returns:
        A list of artists when ``artists`` is a string: the parsed sequence for
        list/tuple/set literals, ``[parsed]`` for a quoted-string literal, and
        ``[artists]`` when the text is unparseable or is some other literal
        (e.g. the number "311"). Non-string input is returned as-is.
    """
    if not isinstance(artists, str):
        return artists
    try:
        parsed = ast.literal_eval(artists)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval raises for malformed input are handled; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return [artists]
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    if isinstance(parsed, str):
        return [parsed]
    # Any other literal (number, dict, None, ...) cannot be searched with ``in`` as an
    # artist list, so fall back to treating the raw text as a single artist name.
    return [artists]

# Parse stringified artist lists in the Spotify 'artists' column
merged_df['artists'] = merged_df['artists'].apply(parse_artists)


def _lyrics_artist_is_credited(row):
    """Return True when the lyrics-side artist appears in the row's Spotify artists."""
    return row['artist'] in row['artists']


# Keep only rows where lyrics artist is in Spotify artists list,
# then renumber the surviving rows from 0
artist_match_mask = merged_df.apply(_lyrics_artist_is_credited, axis=1)
filtered_df = merged_df[artist_match_mask].reset_index(drop=True)

In [25]:
# Row count after keeping only artist-matched pairs.
len(filtered_df)

53793

In [27]:
# Drop the lyrics-side join columns now that matching is done.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a second
# time raises KeyError because 'title' and 'artist' have already been removed.
filtered_df = filtered_df.drop(columns=['title', 'artist'], errors='ignore')

In [28]:
# Confirm which columns remain after dropping the join columns.
filtered_df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo', 'lyrics'],
      dtype='object')

In [30]:
# Splitting columns into metadata and embedding

# Descriptive, non-embedded attributes of a track.
metadata_columns = [
    # identity
    'id',
    'name',
    'artists',
    # release information
    'year',
    'release_date',
    # catalogue attributes
    'explicit',
    'popularity',
    'duration_ms',
    'key',
    'mode',
]

# Columns serialised into the JSON text that gets embedded (see row_to_json).
embedding_columns = [
    # identity
    'name',
    'artists',
    # audio features
    'valence',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
]


In [31]:
from sentence_transformers import SentenceTransformer

# Load nomic model for embeddings
# NOTE(review): trust_remote_code=True presumably allows code shipped with the model
# repository to run locally — confirm the source is trusted (and consider pinning a revision).
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

<All keys matched successfully>


In [32]:
import json

# Create JSON column for embedding input
def row_to_json(row):
    """Serialise the embedding_columns values of one row into a JSON string.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) indexable by column name.

    Returns:
        JSON text with one key per entry of embedding_columns, in that order.
    """
    payload = {}
    for column in embedding_columns:
        payload[column] = row[column]
    return json.dumps(payload)

# Build the JSON text that will be embedded for every row
filtered_df['embedding_json'] = filtered_df.apply(row_to_json, axis=1)

# Batch embedding creation
# NOTE(review): the nomic-embed-text-v1.5 model card reportedly expects a task prefix
# (e.g. "search_document: ") on each input — confirm whether one should be added here.
batch_size = 1000
jsons = filtered_df['embedding_json'].tolist()
embeddings = []
for batch_number, offset in enumerate(range(0, len(jsons), batch_size), start=1):
    print(f"Processing batch {batch_number}")
    # Encode one slice of at most batch_size JSON strings and collect its vectors
    embeddings.extend(model.encode(jsons[offset:offset + batch_size], show_progress_bar=True))

# One embedding per row, in row order
filtered_df['embedding'] = embeddings
print("Batch Nomic embeddings added to DataFrame.")

Processing batch 1


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 2


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.15it/s]


Processing batch 3


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.18it/s]


Processing batch 4


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.17it/s]


Processing batch 5


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.15it/s]


Processing batch 6


Batches: 100%|██████████| 32/32 [00:28<00:00,  1.14it/s]


Processing batch 7


Batches: 100%|██████████| 32/32 [00:28<00:00,  1.14it/s]


Processing batch 8


Batches: 100%|██████████| 32/32 [00:28<00:00,  1.12it/s]


Processing batch 9


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.04s/it]


Processing batch 10


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.06s/it]


Processing batch 11


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.08s/it]


Processing batch 12


Batches: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]


Processing batch 13


Batches: 100%|██████████| 32/32 [00:38<00:00,  1.21s/it]


Processing batch 14


Batches: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]


Processing batch 15


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.08s/it]


Processing batch 16


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


Processing batch 17


Batches: 100%|██████████| 32/32 [00:35<00:00,  1.09s/it]


Processing batch 18


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


Processing batch 19


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]


Processing batch 20


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


Processing batch 21


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.05s/it]


Processing batch 22


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]


Processing batch 23


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]


Processing batch 24


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]


Processing batch 25


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.03s/it]


Processing batch 26


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 27


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.01s/it]


Processing batch 28


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 29


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 30


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.05s/it]


Processing batch 31


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.03s/it]


Processing batch 32


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 33


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 34


Batches: 100%|██████████| 32/32 [00:31<00:00,  1.00it/s]


Processing batch 35


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.03s/it]


Processing batch 36


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 37


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.03s/it]


Processing batch 38


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.04it/s]


Processing batch 39


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.03it/s]


Processing batch 40


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.04it/s]


Processing batch 41


Batches: 100%|██████████| 32/32 [00:29<00:00,  1.08it/s]


Processing batch 42


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.05it/s]


Processing batch 43


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.05it/s]


Processing batch 44


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.05it/s]


Processing batch 45


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.05it/s]


Processing batch 46


Batches: 100%|██████████| 32/32 [00:29<00:00,  1.08it/s]


Processing batch 47


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.06it/s]


Processing batch 48


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 49


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 50


Batches: 100%|██████████| 32/32 [00:31<00:00,  1.01it/s]


Processing batch 51


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.06s/it]


Processing batch 52


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 53


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 54


Batches: 100%|██████████| 25/25 [00:25<00:00,  1.01s/it]


ValueError: Length of values (53793) does not match length of index (170653)

In [33]:
# Attach the computed embeddings as a column (the same assignment that ends the batching cell).
filtered_df['embedding'] = embeddings

In [34]:
# Display the frame to eyeball the new embedding_json / embedding columns.
filtered_df

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,loudness,mode,name,popularity,release_date,speechiness,tempo,lyrics,embedding_json,embedding
0,0.963,1921,0.73200,[Dennis Day],0.819,180533,0.3410,0,7xPhfUan2yNtyFG0cUWkt8,0.000000,...,-12.441,1,Clancy Lowered the Boom,5,1921,0.4150,60.936,Now Clancy was a peaceful man\nIf you know wha...,"{""name"": ""Clancy Lowered the Boom"", ""artists"":...","[-0.14313735, -0.22924717, -4.8405457, -0.9683..."
1,0.402,1923,0.99200,[Bessie Smith],0.693,167640,0.0270,0,6qRvnXftofjYJm1Mg98UWL,0.000000,...,-13.506,0,Need a Little Sugar in My Bowl,26,1923,0.0562,75.749,"Tired of bein' lonely, tired of bein' blue\nI ...","{""name"": ""Need a Little Sugar in My Bowl"", ""ar...","[-0.69976574, 0.0118034305, -4.594035, -1.2274..."
2,0.494,1923,0.99000,[Bessie Smith],0.710,169960,0.0735,0,1yjck0Owf0HfhY5kWHiXIQ,0.000000,...,-12.526,1,Me and My Gin,22,1923,0.0488,90.917,Stay away from me 'cause I'm in my sin\nStay a...,"{""name"": ""Me and My Gin"", ""artists"": [""Bessie ...","[-0.82054853, -0.86439043, -4.8381505, -1.1492..."
3,0.483,1923,0.99200,[Bessie Smith],0.587,205053,0.1750,0,0rUf1j43orpfi6f4LbVo8D,0.000318,...,-9.935,1,Careless Love,22,1923,0.0384,98.523,"Love, oh love, oh careless love\nYou've fly th...","{""name"": ""Careless Love"", ""artists"": [""Bessie ...","[-0.18622221, -0.9087218, -4.7537165, -0.95859..."
4,0.258,1923,0.99100,"[Bessie Smith, Louis Armstrong]",0.355,189600,0.1140,0,50zXyjVdFb3xAr3hnyYYn1,0.006860,...,-15.968,1,St. Louis Blues,22,1923,0.0588,69.902,I hate to see that evening sun go down\nI hate...,"{""name"": ""St. Louis Blues"", ""artists"": [""Bessi...","[-0.7889819, -0.25035557, -4.906985, -0.976880..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53788,0.466,2020,0.31000,[Fleet Foxes],0.562,253613,0.6860,0,308prODCCD0O660tIktbUi,0.022500,...,-8.480,1,Sunblind,66,2020-09-22,0.0249,103.054,[Verse 1]\nFor Richard Swift\nFor John and Bil...,"{""name"": ""Sunblind"", ""artists"": [""Fleet Foxes""...","[-0.69030887, -0.19668527, -4.8022137, -0.3032..."
53789,0.522,2020,0.20400,[Gunna],0.598,230600,0.4720,1,2f8y4CuG57UJEmkG3ujd0D,0.000015,...,-10.991,1,NASTY GIRL / ON CAMERA,66,2020-05-22,0.2580,120.080,"[Part I: ""NASTY GIRL""]\n\n[Intro]\nMmm, mmm, m...","{""name"": ""NASTY GIRL / ON CAMERA"", ""artists"": ...","[0.2002709, -0.8311799, -4.3318987, 1.2301844,..."
53790,0.734,2020,0.20600,[Ashnikko],0.717,150654,0.7530,0,0OStKKAuXlxA0fMH54Qs6E,0.000000,...,-6.020,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936,[Intro]\nHey grandpa (Grandpa)\nDrop the beat ...,"{""name"": ""Halloweenie III: Seven Days"", ""artis...","[-1.011036, 0.13105537, -4.697309, 0.13486801,..."
53791,0.637,2020,0.10100,[MAMAMOO],0.634,211280,0.8580,0,4BZXVFYCb76Q0Klojq4piV,0.000009,...,-2.226,0,AYA,76,2020-11-03,0.0809,91.688,"[마마무 ""아야 (AYA)"" 가사]\n\n[Intro: Solar]\nAya\nAy...","{""name"": ""AYA"", ""artists"": [""MAMAMOO""], ""valen...","[-0.56497407, 0.15274733, -4.365804, 0.0788590..."


In [35]:
# Persist the frame (without the index) so the embeddings need not be recomputed.
embeddings_csv_path = '/Users/devyanigauri/Documents/GitHub/rhythmodoro/src/spotify_dataset_with_embeddings.csv'
filtered_df.to_csv(embeddings_csv_path, index=False)
print("DataFrame with embeddings saved to CSV.")

DataFrame with embeddings saved to CSV.


In [36]:
# Read the saved frame back from CSV.
# NOTE(review): the 'embedding' column presumably comes back as text, which is why
# np_str_to_list is applied to it later — confirm.
embedded_music_data = pd.read_csv('/Users/devyanigauri/Documents/GitHub/rhythmodoro/src/spotify_dataset_with_embeddings.csv')

In [51]:
# Keep a single row per track id ('id' is the primary key of the Milvus schema).
deduplicated_tracks = embedded_music_data.drop_duplicates(subset=['id'])
embedded_music_data = deduplicated_tracks
remaining_rows = len(embedded_music_data)
print(f"After dropping duplicates, {remaining_rows} rows remain.")

After dropping duplicates, 53482 rows remain.


In [52]:
# Row count after de-duplicating on 'id'.
len(embedded_music_data)

53482

In [53]:
# Peek at the first row's embedding value.
embedded_music_data[:1]['embedding'].values[0]

[-0.143137351,
 -0.229247168,
 -4.84054565,
 -0.968335271,
 0.524305642,
 0.728669226,
 0.871309638,
 -0.00974745117,
 -0.156411305,
 -0.443167061,
 -0.640500605,
 1.41033804,
 0.149887279,
 1.41451502,
 1.16565764,
 -1.39578736,
 -1.36437023,
 -2.05990171,
 0.538945615,
 0.581114948,
 -0.229293153,
 -1.79353511,
 -1.72583795,
 -0.670438826,
 2.33432293,
 0.383269131,
 0.0277688112,
 -0.130079582,
 -0.0562638566,
 0.408605903,
 0.219170481,
 0.418112129,
 -0.780590475,
 -0.506763816,
 -0.831357062,
 0.0463779345,
 -0.216171831,
 0.917917013,
 0.396962315,
 0.366175532,
 1.21814239,
 0.19726634,
 0.0773748308,
 -0.208258241,
 -0.228960991,
 -1.34889233,
 0.346865088,
 0.740144312,
 -0.536022067,
 -0.788160741,
 0.154958323,
 -0.995409131,
 -0.0028208266,
 -0.811453402,
 1.61176145,
 0.656850398,
 -0.0377004817,
 -0.343861073,
 -0.464827538,
 0.426975578,
 0.759593308,
 0.507536829,
 0.358966202,
 1.20514977,
 1.35536849,
 -1.13232756,
 -0.180443853,
 1.55379629,
 0.540129602,
 -0.317142

In [54]:
import numpy as np

def np_str_to_list(emb):
    """Convert the text form of an embedding back into a list of floats.

    Accepts both whitespace-separated text ("[0.1 0.2\\n 0.3]") and
    comma-separated text ("[0.1, 0.2, 0.3]"), with optional whitespace
    around the brackets.

    Args:
        emb: Embedding as a string, or an already-parsed list/array.

    Returns:
        A list of floats when ``emb`` is a string (an empty list for "[]");
        otherwise ``emb`` unchanged.

    Raises:
        ValueError: If a token in the string is not a valid float.
    """
    if not isinstance(emb, str):
        return emb  # Already a list or array
    # Trim outer whitespace first so the brackets are really at the ends, then
    # treat commas as plain separators so both text forms split the same way.
    inner = emb.strip().strip('[]').replace(',', ' ')
    return [float(token) for token in inner.split()]
# Convert every stored embedding into a list of floats (non-string values pass through unchanged).
embedded_music_data['embedding'] = embedded_music_data['embedding'].apply(np_str_to_list)

In [None]:
# from pymilvus import list_collections
# list_collections()

['embedded_music_data']

In [60]:
from pymilvus import Collection, list_collections

# Only reset the collection this notebook (re)creates. Looping over every name from
# list_collections() and dropping it would also destroy unrelated collections that
# happen to live on the same Milvus instance.
collection_to_reset = "embedded_music_data"
if collection_to_reset in list_collections():
    print(f"Dropping collection: {collection_to_reset}")
    Collection(collection_to_reset).drop()

Dropping collection: embedded_music_data


In [61]:
# Define metadata fields
# The order of this list matters: the insert cell builds its column lists
# "in the order of your schema", i.e. in exactly this field order.
fields = [
    # Primary key supplied by the data (auto_id=False), not generated by Milvus.
    FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=64),
    FieldSchema(name="name", dtype=DataType.VARCHAR, max_length=256),
    FieldSchema(name="artists", dtype=DataType.VARCHAR, max_length=1000),
    FieldSchema(name="year", dtype=DataType.INT64),
    FieldSchema(name="release_date", dtype=DataType.VARCHAR, max_length=32),
    FieldSchema(name="explicit", dtype=DataType.BOOL),
    FieldSchema(name="popularity", dtype=DataType.INT64),
    FieldSchema(name="duration_ms", dtype=DataType.INT64),
    FieldSchema(name="key", dtype=DataType.INT64),
    FieldSchema(name="mode", dtype=DataType.INT64),
    # The JSON text that was fed to the embedding model for this track.
    FieldSchema(name="embedding_json", dtype=DataType.VARCHAR, max_length=2048),
    # The vector field that the index is later built on.
    FieldSchema(
        name="embedding",
        dtype=DataType.FLOAT_VECTOR,
        dim=768 # assumes all embeddings have same length
    ),
]

# Bundle the fields into a schema with a human-readable description.
schema = CollectionSchema(fields, description="Music tracks with metadata and Nomic embeddings")

# Create the collection with 2 shards for better distribution
# (uses the "default" connection alias opened at the top of the notebook)
collection = Collection(
    name="embedded_music_data",
    schema=schema,
    using="default",
    shards_num=2
)

print("Milvus collection 'embedded_music_data' created.")

Milvus collection 'embedded_music_data' created.


In [62]:
# Load the collection
collection = Collection("embedded_music_data")

# Prepare data for insertion
batch_size = 1000
num_rows = len(embedded_music_data)

for start in range(0, num_rows, batch_size):
    end = min(start + batch_size, num_rows)
    batch = embedded_music_data.iloc[start:end]
    # Prepare data in the order of your schema
    data = [
        batch["id"].tolist(),
        batch["name"].tolist(),
        batch["artists"].tolist(),
        batch["year"].tolist(),
        batch["release_date"].tolist(),
        # The schema declares 'explicit' as BOOL while the frame holds 0/1 values,
        # so convert explicitly instead of relying on implicit int -> bool coercion.
        [bool(flag) for flag in batch["explicit"]],
        batch["popularity"].tolist(),
        batch["duration_ms"].tolist(),
        batch["key"].tolist(),
        batch["mode"].tolist(),
        batch["embedding_json"].tolist(),
        batch["embedding"].tolist(),
    ]
    collection.insert(data)
    print(f"Inserted rows {start} to {end}")

# Flush once after the bulk load so the inserted rows are sealed and persisted
# before the index is built and the connection is closed.
# NOTE(review): re-running this cell inserts the same ids again; drop/recreate the
# collection first if a clean reload is intended.
collection.flush()

print("All data inserted into Milvus.")

Inserted rows 0 to 1000
Inserted rows 1000 to 2000
Inserted rows 2000 to 3000
Inserted rows 3000 to 4000
Inserted rows 4000 to 5000
Inserted rows 5000 to 6000
Inserted rows 6000 to 7000
Inserted rows 7000 to 8000
Inserted rows 4000 to 5000
Inserted rows 5000 to 6000
Inserted rows 6000 to 7000
Inserted rows 7000 to 8000
Inserted rows 8000 to 9000
Inserted rows 9000 to 10000
Inserted rows 10000 to 11000
Inserted rows 11000 to 12000
Inserted rows 8000 to 9000
Inserted rows 9000 to 10000
Inserted rows 10000 to 11000
Inserted rows 11000 to 12000
Inserted rows 12000 to 13000
Inserted rows 13000 to 14000
Inserted rows 14000 to 15000
Inserted rows 12000 to 13000
Inserted rows 13000 to 14000
Inserted rows 14000 to 15000
Inserted rows 15000 to 16000
Inserted rows 16000 to 17000
Inserted rows 17000 to 18000
Inserted rows 15000 to 16000
Inserted rows 16000 to 17000
Inserted rows 17000 to 18000
Inserted rows 18000 to 19000
Inserted rows 19000 to 20000
Inserted rows 20000 to 21000
Inserted rows 1800

In [63]:
# Build an HNSW vector index on the 'embedding' field.
index_params = {
    # Inner Product metric.
    # NOTE(review): IP only coincides with cosine similarity for unit-length vectors.
    # model.encode is called without any normalisation in this notebook and the stored
    # sample vector contains components such as -4.84, so these vectors are not
    # unit-norm — confirm IP is intended, or normalise the vectors / change the metric
    # (searches must then use the same metric).
    "metric_type": "IP",
    "index_type": "HNSW",
    # M: number of bi-directional links per node (8 chosen as a balance of accuracy and
    # memory for this dataset size).
    # efConstruction: controls index build accuracy/speed (64 chosen for good recall
    # and reasonable build time).
    "params": {"M": 8, "efConstruction": 64}
}
collection.create_index(field_name="embedding", index_params=index_params)

Status(code=0, message=)

In [64]:
# Close the "default" Milvus connection opened at the top of the notebook.
connections.disconnect(alias="default")