In [29]:

from dotenv import load_dotenv
import os
import os
import pandas as pd
import kagglehub
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
import numpy as np
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
import google.generativeai as genai
# Load environment variables via python-dotenv (presumably from a local .env file — confirm).
load_dotenv()

# Connect to Milvus
# Opens the "default" connection alias against a Milvus server on localhost:19530;
# this runs at cell execution time, so the server must already be up.
connections.connect(alias="default", host="localhost", port="19530")
print("Connected to Milvus!")

# NOTE(review): both dataset paths are absolute and tied to one user's KaggleHub cache
# (/Users/devyanigauri/...); on any other machine the os.path.exists checks in the
# loading cells will take the download branch.
SPOTIFY_DATASET_PATH = '/Users/devyanigauri/.cache/kagglehub/datasets/yamaerenay/spotify-dataset-1921-2020-160k-tracks/versions/1/data.csv'

LYRICS_DATASET_PATH = "/Users/devyanigauri/.cache/kagglehub/datasets/nikhilnayak123/5-million-song-lyrics-dataset/versions/3/ds2.csv"



Connected to Milvus!


In [14]:
# Load the Spotify tracks CSV, downloading it through KaggleHub when the cached copy is missing.
# The path that is actually read is resolved here: after a fresh download the file is taken
# from the directory KaggleHub reports, which may differ from the hard-coded cache location.
spotify_csv_path = SPOTIFY_DATASET_PATH
if not os.path.exists(spotify_csv_path):
    print("Spotify Dataset not found locally. Downloading from KaggleHub...")
    # dataset_download returns the local directory that holds the dataset files.
    spotify_download_dir = kagglehub.dataset_download('yamaerenay/spotify-dataset-1921-2020-160k-tracks')
    spotify_csv_path = os.path.join(spotify_download_dir, os.path.basename(SPOTIFY_DATASET_PATH))
else:
    print(f"Spotify Dataset found locally at {SPOTIFY_DATASET_PATH}.")

# on_bad_lines='skip' drops malformed CSV rows instead of raising.
df = pd.read_csv(spotify_csv_path, on_bad_lines='skip')
print(f"Loaded Spotify dataset with {len(df)} rows.")

SSpotify Dataset found locally at /Users/devyanigauri/.cache/kagglehub/datasets/yamaerenay/spotify-dataset-1921-2020-160k-tracks/versions/1/data.csv.
Loaded Spotify dataset with 170653 rows.


In [13]:
# Load the lyrics CSV, downloading it through KaggleHub when the cached copy is missing.
# Previously the download result was stored in `path` but never used, so the read always
# went to the hard-coded location even when the download landed somewhere else.
lyrics_csv_path = LYRICS_DATASET_PATH
if not os.path.exists(lyrics_csv_path):
    print("Lyrics Dataset not found locally. Downloading from KaggleHub...")
    # dataset_download returns the local directory that holds the dataset files.
    path = kagglehub.dataset_download("nikhilnayak123/5-million-song-lyrics-dataset")
    lyrics_csv_path = os.path.join(path, os.path.basename(LYRICS_DATASET_PATH))
else:
    print(f"Lyrics Dataset found locally at {LYRICS_DATASET_PATH}.")

# on_bad_lines='skip' drops malformed CSV rows instead of raising.
lyrics_df = pd.read_csv(lyrics_csv_path, on_bad_lines='skip')
print(f"Loaded Lyrics dataset with {len(lyrics_df)} rows.")

Lyrics Dataset found locally at /Users/devyanigauri/.cache/kagglehub/datasets/nikhilnayak123/5-million-song-lyrics-dataset/versions/3/ds2.csv.
Loaded Lyrics dataset with 5913411 rows.


In [16]:
lyrics_df.columns

Index(['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id'], dtype='object')

In [21]:
# Keep only the lyric columns used downstream: 'title' (join key), 'artist' (match filter)
# and 'lyrics'. This also drops the lyrics-side 'id', so the merged frame keeps a single 'id'.
# NOTE(review): the name is rebound in place, so the raw lyrics frame is no longer reachable
# after this cell; the KeyError shown below it looks like output from an earlier column list.
lyrics_df = lyrics_df[['title', 'artist', 'lyrics']]

KeyError: "['year'] not in index"

In [22]:
# Left-join Spotify tracks to lyrics on the exact track title only (df.name == lyrics_df.title).
# Every lyric row sharing a title is attached to every Spotify row with that title, which is
# why the result is far larger than the Spotify table (see the row count in the next cell);
# the artist filter further down is what narrows these candidates.
merged_df = pd.merge(df, lyrics_df, left_on='name', right_on='title', how='left')

In [23]:
len(merged_df)

4976672

In [24]:
import ast

# If 'artists' is a stringified list, convert it to a Python list
def parse_artists(artists):
    """Return the ``artists`` cell as a list of artist names.

    Args:
        artists: Cell value. Strings are expected to be a Python list literal
            such as ``"['A', 'B']"``; any other type is passed through unchanged.

    Returns:
        For string input, always a list: the parsed list/tuple contents, or
        ``[artists]`` when the string is not a list/tuple literal (so a later
        ``name in result`` test is a list membership check rather than a
        substring check or a TypeError). Non-string input is returned as-is.
    """
    if not isinstance(artists, str):
        return artists
    try:
        parsed = ast.literal_eval(artists)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures literal_eval documents for malformed input are treated as
        # "plain name"; KeyboardInterrupt and friends are no longer swallowed.
        return [artists]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A literal that is not a sequence (e.g. "42") is kept as a single artist name.
    return [artists]

# Parse every stringified artists cell into a Python list (overwrites the column in place).
merged_df['artists'] = merged_df['artists'].apply(parse_artists)


def _lyrics_artist_is_credited(row):
    """True when the lyrics-side artist appears in the row's Spotify artists value."""
    return row['artist'] in row['artists']


# Row-wise exact membership test; rows without a credited lyrics artist are dropped.
artist_match_mask = merged_df.apply(_lyrics_artist_is_credited, axis=1)

# Keep the matching rows and renumber them from 0.
filtered_df = merged_df.loc[artist_match_mask].reset_index(drop=True)

In [25]:
len(filtered_df)

53793

In [None]:
# Diagnostics: why did rows shrink? Merge vs filter vs dedup, and recoverable matches
import re, itertools
def _as_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        return list(x)
    if pd.isna(x) or x is None:
        return []
    # try parse stringified list
    try:
        import ast
        v = ast.literal_eval(str(x))
        if isinstance(v, (list, tuple)):
            return list(v)
    except Exception:
        pass
    return [str(x)]

def _normalize_tokens(items):
    out = []
    for s in items:
        if not isinstance(s, str):
            s = str(s)
        s = s.strip()
        # split on common separators / collabs
        parts = re.split(r"\s*(?:,|&|/| x |×|;|feat\.?|featuring|with|and)\s*", s, flags=re.IGNORECASE)
        for p in parts:
            p = p.strip().lower()
            if p:
                out.append(p)
    return out

# Row / unique-id counts at each pipeline stage. The Spotify counts fall back to None when
# `df` is not defined in this namespace; the merged/filtered frames are assumed to exist.
spotify_rows = len(df) if 'df' in locals() else None
spotify_ids = df['id'].nunique() if 'df' in locals() else None
merged_rows = len(merged_df)
merged_ids = merged_df['id'].nunique()
# Rows of the left join that actually received a lyrics candidate.
merged_with_lyrics = merged_df['lyrics'].notna().sum() if 'lyrics' in merged_df.columns else 0
filtered_rows = len(filtered_df)
filtered_ids = filtered_df['id'].nunique()

# One dict so the stages can be compared side by side.
print({
    'spotify_rows': spotify_rows,
    'spotify_unique_ids': spotify_ids,
    'merged_rows': merged_rows,
    'merged_unique_ids': merged_ids,
    'merged_with_lyrics_rows': merged_with_lyrics,
    'filtered_rows_artist_match': filtered_rows,
    'filtered_unique_ids': filtered_ids,
})

# Duplicates per id introduced by title-only merge
# value_counts() sorts descending, so head(10) shows the ids with the most merged rows.
dup_counts = merged_df['id'].value_counts()
multi_hits = (dup_counts > 1).sum()
print(f"IDs with multiple merged rows (same Spotify id, multiple lyric candidates): {multi_hits}")
print("Sample duplicated IDs:")
print(dup_counts.head(10))

# Sample of non-matching artist rows (have lyrics but artist not in Spotify artists list)
def _artist_in_list(row):
    """Exact-match check: is the row's lyrics 'artist' one of its Spotify 'artists'?"""
    candidates = _as_list(row.get('artists'))
    if not isinstance(candidates, list):
        return False
    return row.get('artist') in candidates

# Rows that did get lyrics from the merge but whose lyrics artist is not an exact member
# of the Spotify artists list (row-wise apply over the whole merged frame).
nomatch_mask = merged_df['lyrics'].notna() & ~merged_df.apply(_artist_in_list, axis=1)
if nomatch_mask.any():
    print("Examples where lyrics artist didn't match Spotify artists list (exact match):")
    # Show title plus both artist columns for the first ten mismatches.
    print(merged_df.loc[nomatch_mask, ['name','artists','artist']].head(10))
else:
    print("No exact-match mismatches found (rare)")

# Estimate recoverable matches using case-insensitive tokenized matching
def _artist_match_loose(row):
    """Loose check: do the tokenized Spotify artists share any token with the lyrics artist?"""
    spotify_tokens = _normalize_tokens(_as_list(row.get('artists')))
    lyrics_tokens = _normalize_tokens([row.get('artist', '')])
    # Tokens are plain strings, so a shared element is the same as some pair being equal.
    return not set(spotify_tokens).isdisjoint(lyrics_tokens)

# Count merged rows that have lyrics AND pass the loose (case-insensitive, tokenized)
# artist match; comparing this with the exact-match row count shows how many rows the
# strict filter discards.
recoverable = int((merged_df['lyrics'].notna() & merged_df.apply(_artist_match_loose, axis=1)).sum())
print({'recoverable_loose_rows': recoverable})

In [27]:
# Drop the lyrics-side join columns now that the artist filter has been applied.
# errors='ignore' keeps the cell re-runnable: a second execution used to raise KeyError
# because the columns were already gone from the rebound frame.
filtered_df = filtered_df.drop(columns=['title', 'artist'], errors='ignore')

In [28]:
filtered_df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo', 'lyrics'],
      dtype='object')

In [30]:
# Splitting columns into metadata and embedding
# Descriptive / catalogue fields of a track.
# NOTE(review): metadata_columns is not referenced in the visible cells — presumably used
# when writing to the vector store; confirm.
metadata_columns = [
    'id', 'name', 'artists', 'year', 'release_date', 'explicit',
    'popularity', 'duration_ms', 'key', 'mode'
]

# Fields serialized into the JSON text that is fed to the embedding model
# (track name, artists and the audio-feature values).
embedding_columns = [
    'name', 'artists', 'valence', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo'
]


In [31]:
from sentence_transformers import SentenceTransformer

# Load nomic model for embeddings
# NOTE(review): trust_remote_code=True allows Python code shipped with the model repository
# to run locally — confirm the source is trusted (and consider pinning a revision).
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

<All keys matched successfully>


In [32]:
import json

# Create JSON column for embedding input
def row_to_json(row):
    """Serialize the row's embedding_columns values into a JSON object string."""
    payload = {}
    for col in embedding_columns:
        payload[col] = row[col]
    return json.dumps(payload)

# One JSON document per track; this text is what the embedding model encodes.
filtered_df['embedding_json'] = filtered_df.apply(row_to_json, axis=1)

# Batch embedding creation
# NOTE(review): nomic-embed-text models are documented to expect a task prefix
# (e.g. "search_document: ") on each input — confirm whether it should be added here.
batch_size = 1000
jsons = filtered_df['embedding_json'].tolist()
embeddings = []
for batch_no, start in enumerate(range(0, len(jsons), batch_size), start=1):
    print(f"Processing batch {batch_no}")
    # encode() returns one vector per input; extend() appends them row by row.
    embeddings.extend(model.encode(jsons[start:start + batch_size], show_progress_bar=True))

# Requires len(embeddings) == len(filtered_df); pandas raises ValueError otherwise.
filtered_df['embedding'] = embeddings
print("Batch Nomic embeddings added to DataFrame.")

Processing batch 1


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 2


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.15it/s]


Processing batch 3


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.18it/s]


Processing batch 4


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.17it/s]


Processing batch 5


Batches: 100%|██████████| 32/32 [00:27<00:00,  1.15it/s]


Processing batch 6


Batches: 100%|██████████| 32/32 [00:28<00:00,  1.14it/s]


Processing batch 7


Batches: 100%|██████████| 32/32 [00:28<00:00,  1.14it/s]


Processing batch 8


Batches: 100%|██████████| 32/32 [00:28<00:00,  1.12it/s]


Processing batch 9


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.04s/it]


Processing batch 10


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.06s/it]


Processing batch 11


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.08s/it]


Processing batch 12


Batches: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]


Processing batch 13


Batches: 100%|██████████| 32/32 [00:38<00:00,  1.21s/it]


Processing batch 14


Batches: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]


Processing batch 15


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.08s/it]


Processing batch 16


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


Processing batch 17


Batches: 100%|██████████| 32/32 [00:35<00:00,  1.09s/it]


Processing batch 18


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


Processing batch 19


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]


Processing batch 20


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


Processing batch 21


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.05s/it]


Processing batch 22


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]


Processing batch 23


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]


Processing batch 24


Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]


Processing batch 25


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.03s/it]


Processing batch 26


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 27


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.01s/it]


Processing batch 28


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 29


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 30


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.05s/it]


Processing batch 31


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.03s/it]


Processing batch 32


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 33


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 34


Batches: 100%|██████████| 32/32 [00:31<00:00,  1.00it/s]


Processing batch 35


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.03s/it]


Processing batch 36


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 37


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.03s/it]


Processing batch 38


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.04it/s]


Processing batch 39


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.03it/s]


Processing batch 40


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.04it/s]


Processing batch 41


Batches: 100%|██████████| 32/32 [00:29<00:00,  1.08it/s]


Processing batch 42


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.05it/s]


Processing batch 43


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.05it/s]


Processing batch 44


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.05it/s]


Processing batch 45


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.05it/s]


Processing batch 46


Batches: 100%|██████████| 32/32 [00:29<00:00,  1.08it/s]


Processing batch 47


Batches: 100%|██████████| 32/32 [00:30<00:00,  1.06it/s]


Processing batch 48


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 49


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 50


Batches: 100%|██████████| 32/32 [00:31<00:00,  1.01it/s]


Processing batch 51


Batches: 100%|██████████| 32/32 [00:33<00:00,  1.06s/it]


Processing batch 52


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]


Processing batch 53


Batches: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Processing batch 54


Batches: 100%|██████████| 25/25 [00:25<00:00,  1.01s/it]


ValueError: Length of values (53793) does not match length of index (170653)

In [33]:
# Repeats the final assignment of the embedding cell above, whose recorded run ended in a
# length-mismatch ValueError. NOTE(review): this only works with the kernel state left by
# that run (`embeddings` and a filtered_df of equal length) — verify under Restart & Run All.
filtered_df['embedding'] = embeddings

In [34]:
filtered_df

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,loudness,mode,name,popularity,release_date,speechiness,tempo,lyrics,embedding_json,embedding
0,0.963,1921,0.73200,[Dennis Day],0.819,180533,0.3410,0,7xPhfUan2yNtyFG0cUWkt8,0.000000,...,-12.441,1,Clancy Lowered the Boom,5,1921,0.4150,60.936,Now Clancy was a peaceful man\nIf you know wha...,"{""name"": ""Clancy Lowered the Boom"", ""artists"":...","[-0.14313735, -0.22924717, -4.8405457, -0.9683..."
1,0.402,1923,0.99200,[Bessie Smith],0.693,167640,0.0270,0,6qRvnXftofjYJm1Mg98UWL,0.000000,...,-13.506,0,Need a Little Sugar in My Bowl,26,1923,0.0562,75.749,"Tired of bein' lonely, tired of bein' blue\nI ...","{""name"": ""Need a Little Sugar in My Bowl"", ""ar...","[-0.69976574, 0.0118034305, -4.594035, -1.2274..."
2,0.494,1923,0.99000,[Bessie Smith],0.710,169960,0.0735,0,1yjck0Owf0HfhY5kWHiXIQ,0.000000,...,-12.526,1,Me and My Gin,22,1923,0.0488,90.917,Stay away from me 'cause I'm in my sin\nStay a...,"{""name"": ""Me and My Gin"", ""artists"": [""Bessie ...","[-0.82054853, -0.86439043, -4.8381505, -1.1492..."
3,0.483,1923,0.99200,[Bessie Smith],0.587,205053,0.1750,0,0rUf1j43orpfi6f4LbVo8D,0.000318,...,-9.935,1,Careless Love,22,1923,0.0384,98.523,"Love, oh love, oh careless love\nYou've fly th...","{""name"": ""Careless Love"", ""artists"": [""Bessie ...","[-0.18622221, -0.9087218, -4.7537165, -0.95859..."
4,0.258,1923,0.99100,"[Bessie Smith, Louis Armstrong]",0.355,189600,0.1140,0,50zXyjVdFb3xAr3hnyYYn1,0.006860,...,-15.968,1,St. Louis Blues,22,1923,0.0588,69.902,I hate to see that evening sun go down\nI hate...,"{""name"": ""St. Louis Blues"", ""artists"": [""Bessi...","[-0.7889819, -0.25035557, -4.906985, -0.976880..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53788,0.466,2020,0.31000,[Fleet Foxes],0.562,253613,0.6860,0,308prODCCD0O660tIktbUi,0.022500,...,-8.480,1,Sunblind,66,2020-09-22,0.0249,103.054,[Verse 1]\nFor Richard Swift\nFor John and Bil...,"{""name"": ""Sunblind"", ""artists"": [""Fleet Foxes""...","[-0.69030887, -0.19668527, -4.8022137, -0.3032..."
53789,0.522,2020,0.20400,[Gunna],0.598,230600,0.4720,1,2f8y4CuG57UJEmkG3ujd0D,0.000015,...,-10.991,1,NASTY GIRL / ON CAMERA,66,2020-05-22,0.2580,120.080,"[Part I: ""NASTY GIRL""]\n\n[Intro]\nMmm, mmm, m...","{""name"": ""NASTY GIRL / ON CAMERA"", ""artists"": ...","[0.2002709, -0.8311799, -4.3318987, 1.2301844,..."
53790,0.734,2020,0.20600,[Ashnikko],0.717,150654,0.7530,0,0OStKKAuXlxA0fMH54Qs6E,0.000000,...,-6.020,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936,[Intro]\nHey grandpa (Grandpa)\nDrop the beat ...,"{""name"": ""Halloweenie III: Seven Days"", ""artis...","[-1.011036, 0.13105537, -4.697309, 0.13486801,..."
53791,0.637,2020,0.10100,[MAMAMOO],0.634,211280,0.8580,0,4BZXVFYCb76Q0Klojq4piV,0.000009,...,-2.226,0,AYA,76,2020-11-03,0.0809,91.688,"[마마무 ""아야 (AYA)"" 가사]\n\n[Intro: Solar]\nAya\nAy...","{""name"": ""AYA"", ""artists"": [""MAMAMOO""], ""valen...","[-0.56497407, 0.15274733, -4.365804, 0.0788590..."


In [35]:
# Persist the frame (including the embedding column) as CSV at an absolute, user-specific path.
# NOTE(review): CSV stores each embedding as text, so it has to be re-parsed after loading
# (see np_str_to_list below); a typed format such as parquet would avoid the round-trip.
filtered_df.to_csv('/Users/devyanigauri/Documents/GitHub/rhythmodoro/src/spotify_dataset_with_embeddings.csv', index=False)
print("DataFrame with embeddings saved to CSV.")

DataFrame with embeddings saved to CSV.


In [51]:
# Reload the saved frame from the same absolute path; list-like columns such as 'embedding'
# come back as strings at this point.
embedded_music_data = pd.read_csv('/Users/devyanigauri/Documents/GitHub/rhythmodoro/src/spotify_dataset_with_embeddings.csv')

In [52]:
# Keep the first row for each Spotify id (a track can appear once per matched lyrics row).
embedded_music_data = embedded_music_data.drop_duplicates(subset=['id'])
remaining_rows = len(embedded_music_data)
print(f"After dropping duplicates, {remaining_rows} rows remain.")

After dropping duplicates, 26802 rows remain.


In [53]:
import numpy as np
import json
import ast
import re

def np_str_to_list(emb):
    """Convert a stored embedding value back into a Python list.

    Lists, tuples and arrays are copied with ``list()``. ``None`` / NaN give
    ``[]``. Any other value is stringified and parsed, in order, as JSON, then
    as a Python literal, and finally by stripping brackets and splitting on
    whitespace/commas (the form numpy arrays take when written as text).

    Raises:
        ValueError: if the final fallback meets a token that is not a float.
    """
    if isinstance(emb, (list, tuple, np.ndarray)):
        return list(emb)
    if emb is None:
        return []

    # pd.isna can return a non-scalar for exotic inputs; treat that as "not missing".
    try:
        missing = bool(pd.isna(emb))
    except Exception:
        missing = False
    if missing:
        return []

    text = str(emb).strip()

    # Structured parsers first: JSON must give a list; a literal may also be a tuple/array.
    attempts = (
        (json.loads, list),
        (ast.literal_eval, (list, tuple, np.ndarray)),
    )
    for parse, accepted in attempts:
        try:
            candidate = parse(text)
            if isinstance(candidate, accepted):
                return [float(item) for item in candidate]
        except Exception:
            continue

    # Fallback: strip brackets and split on commas/whitespace
    tokens = re.split(r'[\s,]+', text.strip('[]()'))
    return [float(tok) for tok in tokens if tok]

# Turn the text form of each embedding (as read back from CSV) into a list of floats.
# Values that are already lists pass through np_str_to_list, so re-running the cell is harmless.
embedded_music_data['embedding'] = embedded_music_data['embedding'].apply(np_str_to_list)

In [54]:
# Select features for clustering (use only those present)
feature_candidates = ['valence', 'energy', 'danceability', 'acousticness', 'tempo', 'liveness', 'instrumentalness', 'speechiness']
features = [c for c in feature_candidates if c in embedded_music_data.columns]
# Missing feature values are replaced by 0.0 before scaling.
X = embedded_music_data[features].astype(float).fillna(0.0)

# Scale features
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

# Model selection via BIC on a small grid of K; prefer 'diag', upgrade to 'full' only if it clearly wins
candidate_ns = [6, 8, 10, 12, 14, 16]
best_bic = np.inf
best_gmm = None
best_n = None
best_cov = None
BIC_ADVANTAGE = 10.0  # threshold for a clear win

for n in candidate_ns:
    # diag first
    gmm_diag = GaussianMixture(n_components=n, covariance_type='diag', reg_covar=1e-5, random_state=42)
    gmm_diag.fit(Xs)
    bic_diag = gmm_diag.bic(Xs)

    chosen_gmm = gmm_diag
    chosen_bic = bic_diag
    chosen_cov = 'diag'

    # try full, adopt only if clearly better
    try:
        gmm_full = GaussianMixture(n_components=n, covariance_type='full', reg_covar=1e-5, random_state=42)
        gmm_full.fit(Xs)
        bic_full = gmm_full.bic(Xs)
        if bic_full + BIC_ADVANTAGE < bic_diag:
            chosen_gmm = gmm_full
            chosen_bic = bic_full
            chosen_cov = 'full'
    except Exception as e:
        # Still best-effort (the 'diag' fit stays selected), but report the failure instead of
        # hiding it, so a run where 'full' never gets evaluated is visible in the output.
        print(f"Full-covariance GMM failed for n_components={n}; keeping 'diag': {e}")

    # Lower BIC is better; keep the best model seen across the whole grid.
    if chosen_bic < best_bic:
        best_bic = chosen_bic
        best_gmm = chosen_gmm
        best_n = n
        best_cov = chosen_cov

print(f"Selected GMM: n_components={best_n}, covariance_type={best_cov}, BIC={best_bic:.2f}")

# Soft assignments
probs = best_gmm.predict_proba(Xs)
clusters_hard = probs.argmax(axis=1)   # most likely component per track
clusters_conf = probs.max(axis=1)      # posterior probability of that component

embedded_music_data['vibe_cluster'] = clusters_hard
embedded_music_data['vibe_conf'] = clusters_conf
# Tracks whose best component has probability below 0.45 are flagged as blended.
embedded_music_data['blended'] = embedded_music_data['vibe_conf'] < 0.45

# Profile clusters in original feature space (using hard assignments)
cluster_profiles = {}
for k in range(best_n):
    mask = embedded_music_data['vibe_cluster'] == k
    if mask.any():
        cluster_profiles[k] = embedded_music_data.loc[mask, features].mean().to_dict()
    else:
        # A component that received no hard assignments gets an all-NaN profile.
        cluster_profiles[k] = {f: float('nan') for f in features}

# Helper to format a profile into a short text description
def profile_to_text(profile: dict) -> str:
    """Render a cluster profile as a short comma-separated description.

    Features are emitted in a fixed order and only when present in ``profile``.
    NaN values are treated as 0.0. Tempo is printed as rounded BPM; every other
    feature gets a low/medium/high tag (cut-offs 0.33 and 0.66) plus its value
    to two decimals.
    """
    ordered = ['valence','energy','danceability','acousticness','tempo','liveness','instrumentalness','speechiness']

    def _describe(feature: str) -> str:
        raw = profile[feature]
        value = float(raw) if raw == raw else 0.0  # NaN is the only value unequal to itself
        if feature == 'tempo':
            return f"tempo ~ {value:.0f} BPM"
        if value < 0.33:
            level = 'low'
        elif value < 0.66:
            level = 'medium'
        else:
            level = 'high'
        return f"{feature}: {level} ({value:.2f})"

    return ", ".join(_describe(name) for name in ordered if name in profile)



Selected GMM: n_components=16, covariance_type=full, BIC=261849.47


In [60]:
# Ensure Ollama is running and a local model is available (pull if missing)
import os, time, requests
# Force the model name for this kernel session; this overwrites any OLLAMA_MODEL value
# already present in the environment.
os.environ["OLLAMA_MODEL"] = "qwen2.5:3b-instruct"
# Base URL of the Ollama HTTP API; OLLAMA_HOST in the environment overrides the local default.
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
# Models tried in order by the selection loop at the end of this cell.
PREFERRED_MODELS = ["qwen2.5:3b-instruct"]

def ollama_up(base: str) -> bool:
    """Return True when GET {base}/api/tags answers with HTTP 200 within 3 seconds."""
    try:
        return requests.get(f"{base}/api/tags", timeout=3).status_code == 200
    except Exception:
        # Connection errors, timeouts, etc. all mean "not reachable".
        return False

def list_models(base: str):
    """Return the "name" of every entry under "models" in GET {base}/api/tags.

    Any failure (network error, non-2xx status, unexpected payload) yields [].
    Entries without a name contribute None.
    """
    try:
        resp = requests.get(f"{base}/api/tags", timeout=10)
        resp.raise_for_status()
        listed = (resp.json() or {}).get("models", [])
        names = []
        for entry in listed:
            names.append(entry.get("name"))
        return names
    except Exception:
        return []

def has_model(name: str, base: str) -> bool:
    """Return True when a model with exactly this name (case-insensitive) is listed by the server."""
    for candidate in list_models(base):
        if candidate and candidate.lower() == name.lower():
            return True
    return False

def pull_model(name: str, base: str) -> bool:
    """Pull a model through POST {base}/api/pull and report whether it is available.

    The streamed response is read line by line until a status containing
    "success" arrives. Progress lines carry a numeric "completed" byte counter;
    it is deliberately NOT treated as completion — doing so returned True on the
    first progress update and closed the stream before the download had finished.

    Args:
        name: Model name/tag to pull.
        base: Base URL of the Ollama server.

    Returns:
        True when the pull reports success (or the model is listed once the
        stream ends); False on a reported error or any exception.
    """
    try:
        # Stream pull for progress; tolerate if stream isn't supported
        with requests.post(f"{base}/api/pull", json={"model": name, "stream": True}, stream=True, timeout=600) as r:
            r.raise_for_status()
            for line in r.iter_lines():
                if not line:
                    continue
                try:
                    obj = json.loads(line.decode("utf-8"))
                except Exception:
                    # Skip lines that are not valid JSON.
                    continue
                if not isinstance(obj, dict):
                    continue
                if obj.get("error"):
                    # The server reported a failure for this pull.
                    return False
                status = str(obj.get("status") or "")
                if "success" in status.lower():
                    return True
        # Fallback check if model appears after pull
        return has_model(name, base)
    except Exception:
        return False

# Pick the first preferred model that is already installed or can be pulled, and export it
# through OLLAMA_MODEL for the following cells.
selected_model = None
if ollama_up(OLLAMA_HOST):
    for m in PREFERRED_MODELS:
        # `or` short-circuits: a pull is only attempted when the model is not installed.
        if has_model(m, OLLAMA_HOST) or pull_model(m, OLLAMA_HOST):
            selected_model = m
            break
    if not selected_model:
        print("Could not pull a preferred model automatically. Pull one manually, e.g.: 'ollama pull llama3.2:3b-instruct'")
    else:
        os.environ["OLLAMA_MODEL"] = selected_model
        print(f"Ollama ready. Using model: {selected_model}")
else:
    print(f"Ollama server not reachable at {OLLAMA_HOST}. Start it in a terminal: 'ollama serve'")

Ollama ready. Using model: qwen2.5:3b-instruct


In [64]:
# Name clusters with a local LLM via Ollama (no external rate limits). Minimal rules; aim for balanced label usage.
# New approach:
#  - Single prompt with all cluster profiles; model assigns labels from the canonical list.
#  - Ask it to use each label at least once and distribute labels as evenly as possible across clusters while respecting features.
#  - If some clusters are missing or some labels remain unused, fill per-cluster via a tiny prompt, then round-robin as last resort.
vibe_names = {}
try:
    import difflib, requests, os, json, re, random

    # Canonical label set (your preferred labels) — 'Good Vibes' removed per request
    # These are the only names a cluster may end up with (besides the neutral "Vibe k" fallback).
    canonical_labels = [
        "Chill", "Pop", "Dance", "Acoustic", "Upbeat", "Groove",
        "Lofi", "Soft Haze", "Pump Up", "Midnight Blues"
    ]

    # Set form of the labels for membership checks.
    allowed_set = set(canonical_labels)

    # Lightweight canonicalization to keep outputs in the allowed set if wording drifts slightly.
    # Map 'good vibes' synonyms to 'Upbeat' now that 'Good Vibes' is removed.
    # Keys are looked up after canonicalize() has lower-cased the text and turned '-'/'_' into
    # spaces. NOTE(review): because of that normalization the "lo-fi" key can never be hit;
    # "lo fi" is the entry that actually matches hyphenated input.
    synonym_map = {
        "lo-fi": "Lofi",
        "lo fi": "Lofi",
        "good vibes": "Upbeat",
        "feel good": "Upbeat",
        "uplifting": "Upbeat",
        "pump up": "Pump Up",
        "midnight": "Midnight Blues",
        "blues": "Midnight Blues",
    }

    def _strip_punct_lower(s: str) -> str:
        s = s.lower().replace("_", " ").replace("-", " ")
        s = re.sub(r"[^a-z0-9\s]", " ", s)
        return " ".join(s.split())

    def canonicalize(label: str) -> str:
        """Map free-form model output onto one of the canonical labels.

        Stages, applied to the normalized text (lower-case, punctuation removed):
        synonym lookup, exact match, "contains a canonical label", then a fuzzy
        match. The fuzzy stage now compares normalized text against lower-cased
        labels; it previously compared the raw, case-sensitive text with the
        Title-Case labels, so near-misses differing only in case or punctuation
        fell below the cutoff.

        Args:
            label: Text returned by the model (any type is tolerated).

        Returns:
            A member of ``canonical_labels``, or "" when nothing matches.
        """
        if not isinstance(label, str) or not label.strip():
            return ""
        norm = _strip_punct_lower(label.strip())
        if norm in synonym_map:
            mapped = synonym_map[norm]
            return mapped if mapped in allowed_set else ""
        # exact match
        for c in canonical_labels:
            if norm == c.lower():
                return c
        # contains any canonical token
        for c in canonical_labels:
            if c.lower() in norm:
                return c
        # fuzzy (case-insensitive): match against lower-cased labels, then map back
        by_lower = {c.lower(): c for c in canonical_labels}
        match = difflib.get_close_matches(norm, list(by_lower), n=1, cutoff=0.7)
        return by_lower[match[0]] if match else ""

    # Model selection: prefer env model or any installed preferred model.
    base = os.getenv("OLLAMA_HOST", "http://localhost:11434")

    def list_models(base: str):
        """Return the non-empty model names listed by GET {base}/api/tags ([] on any failure)."""
        try:
            resp = requests.get(f"{base}/api/tags", timeout=10)
            resp.raise_for_status()
            listed = (resp.json() or {}).get("models", [])
            names = []
            for entry in listed:
                model_name = entry.get("name")
                if model_name:
                    names.append(model_name)
            return names
        except Exception:
            return []

    # Candidate order: the OLLAMA_MODEL env value first (even if not installed), then every
    # installed model whose name equals or starts with a preferred name, in preference order.
    installed = list_models(base)
    env_model = os.getenv("OLLAMA_MODEL")
    preferred = [
        "qwen2.5:3b-instruct", "llama3.2:3b-instruct", "llama3.2:instruct",
        "phi3:mini-4k-instruct", "phi3:mini", "mistral:7b-instruct"
    ]
    model_candidates = [env_model] if env_model else []
    for pref in preferred:
        wanted = pref.lower()
        for name in installed:
            # startswith() also covers the exact-name case.
            if name.lower().startswith(wanted) and name not in model_candidates:
                model_candidates.append(name)
    # Nothing preferred is available: fall back to whatever is installed first.
    if not model_candidates and installed:
        model_candidates = [installed[0]]

    def ollama_generate(prompt: str, model: str, expect_json: bool = False) -> str:
        """Send one non-streaming request to {base}/api/generate and return the stripped "response" text.

        ``expect_json=True`` adds ``"format": "json"`` to the request. HTTP errors
        and timeouts (90 s) propagate to the caller.
        """
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {"temperature": 0.3, "num_predict": 256}
        }
        if expect_json:
            payload["format"] = "json"
        resp = requests.post(f"{base}/api/generate", json=payload, timeout=90)
        resp.raise_for_status()
        # A missing or null "response" is returned as an empty string.
        return (resp.json().get("response") or "").strip()

    def extract_json(text: str):
        """Parse JSON out of model output.

        Tries the whole text first, then a ```json fenced object, then the widest
        ``{...}`` span. Returns the decoded value, or None when nothing parses.
        """
        def _try_load(candidate):
            # (parsed_ok, value) so that a legitimately decoded null is not mistaken for failure
            try:
                return True, json.loads(candidate)
            except Exception:
                return False, None

        ok, value = _try_load(text)
        if ok:
            return value

        patterns = (
            r"```json\s*(\{[\s\S]*?\})\s*```",  # fenced code block
            r"(\{[\s\S]*\})",                   # first '{' through last '}'
        )
        for pattern in patterns:
            found = re.search(pattern, text)
            if found:
                ok, value = _try_load(found.group(1))
                if ok:
                    return value
        return None

    # Build cluster descriptions
    # One dict per cluster: id, text summary of its mean features, member count and the mean
    # assignment confidence of its members (0.0 for an empty cluster).
    cluster_desc = []
    all_ids = []
    for k, prof in cluster_profiles.items():
        feat = profile_to_text(prof)
        mask = embedded_music_data['vibe_cluster'] == k
        size = int(mask.sum())
        avg_conf = float(embedded_music_data.loc[mask, 'vibe_conf'].mean()) if size > 0 else 0.0
        cluster_desc.append({"id": int(k), "features": feat, "size": size, "avg_conf": round(avg_conf, 3)})
        all_ids.append(int(k))
    all_ids = sorted(all_ids)

    # Pieces interpolated into the prompts: the allowed labels, the cluster summaries as JSON,
    # and an empty {"<id>": ""} template the model is asked to fill in.
    allowed_list_str = ", ".join(canonical_labels)
    clusters_json = json.dumps(cluster_desc)
    template_map = {str(cid): "" for cid in all_ids}

    # First pass: assign labels to each cluster id aiming for even coverage
    # The instruction text is sent verbatim to the model as the head of the first-pass prompt.
    system_prompt = (
        "You assign vibe labels to music clusters based on feature summaries. "
        "Use ONLY these exact labels: [" + allowed_list_str + "]. "
        "Assign exactly one label per cluster id. "
        "Distribute labels as evenly as possible across all clusters while keeping choices plausible for the features. "
        "You MUST include ALL of these cluster ids: " + ", ".join(map(str, all_ids)) + ". "
        "Fill this JSON template with labels as values (do not change keys):\n" + json.dumps(template_map) + "\n"
        "Return ONLY a JSON object mapping cluster id (int-as-string) -> label (string)."
    )

    def label_all_clusters(model: str) -> dict:
        """Ask ``model`` to label every cluster in one request.

        Returns a dict {cluster id (int): canonical label}. Keys that are not
        integers and values that do not canonicalize to an allowed label are
        dropped, so the result may be partial or empty.
        """
        prompt = system_prompt + "\nClusters: " + clusters_json + "\nJSON mapping:"
        parsed = extract_json(ollama_generate(prompt, model, expect_json=True)) or {}
        if not isinstance(parsed, dict):
            return {}

        labelled = {}
        for raw_id, raw_label in parsed.items():
            # keys expected as strings of ints
            try:
                cluster_id = int(raw_id)
            except Exception:
                continue
            label = canonicalize(raw_label)
            if label in allowed_set:
                labelled[cluster_id] = label
        return labelled

    def choose_label_for_cluster(model: str, entry: dict) -> str:
        """Ask ``model`` for the single best label for one cluster; returns a canonical label or ""."""
        # Tiny prompt per cluster to fill any missing
        header = "Pick the single best fitting label from this allowed list: [" + allowed_list_str + "].\n"
        rule = "Return ONLY the label (exact casing).\n"
        details = f"Cluster id: {entry['id']}\nFeatures: {entry['features']}\nLabel:"
        reply = ollama_generate(header + rule + details, model, expect_json=False)
        return canonicalize(reply)

    # First pass over the candidate models. The most complete mapping seen so far is kept:
    # previously each attempt overwrote `mapping`, so a later model that failed or labelled
    # fewer clusters threw away a better partial result from an earlier one.
    mapping = {}
    raw_first_pass_ok = False
    if not model_candidates:
        print("No local Ollama models detected. We'll assign neutral 'Vibe k' names. Install one, e.g., 'ollama pull qwen2.5:3b-instruct'.")
    else:
        valid_ids = set(all_ids)
        for m in model_candidates:
            try:
                candidate_mapping = label_all_clusters(m)
            except Exception as exc:
                # Best-effort: report the failure and try the next model.
                print(f"First-pass labeling with model '{m}' failed: {exc}")
                continue
            # Ignore ids the model invented so they cannot count towards completeness.
            candidate_mapping = {cid: lab for cid, lab in candidate_mapping.items() if cid in valid_ids}
            if len(candidate_mapping) > len(mapping):
                mapping = candidate_mapping
            if mapping and len(mapping) == len(all_ids):
                raw_first_pass_ok = True
                break

    # Fill any missing cluster ids via per-cluster prompt; fallback to round-robin of least-used labels
    missing_ids = [cid for cid in all_ids if cid not in mapping]
    if missing_ids and model_candidates:
        # Try per-cluster LLM assignment
        for cid in missing_ids:
            # Look up this cluster's description; skip the id if none exists.
            entry = next((e for e in cluster_desc if e['id'] == cid), None)
            if not entry:
                continue
            # First model that returns an allowed label wins; request errors move on to the next model.
            for m in model_candidates:
                try:
                    lab = choose_label_for_cluster(m, entry)
                    if lab in allowed_set:
                        mapping[cid] = lab
                        break
                except Exception:
                    continue
    # If still missing, do round-robin on least-used labels
    missing_ids = [cid for cid in all_ids if cid not in mapping]
    if missing_ids:
        # Usage count of each canonical label in the mapping so far.
        counts = {lab: 0 for lab in canonical_labels}
        for lab in mapping.values():
            if lab in counts:
                counts[lab] += 1
        # Least-used labels first; random.random() breaks ties, so the order among equally
        # used labels is not reproducible between runs.
        labels_by_need = sorted(canonical_labels, key=lambda L: (counts[L], random.random()))
        i = 0
        for cid in missing_ids:
            mapping[cid] = labels_by_need[i % len(labels_by_need)]
            i += 1

    # Coverage check: ensure every label appears at least once across clusters
    # For each unused label, one cluster is taken from the label that is currently used most,
    # and only from labels used more than once. Counts are updated after every move, so the
    # step can no longer remove the last occurrence of a label while trying to add another
    # (the earlier version ranked donors once, on stale counts).
    # The per-cluster "choose between two options" model call was dropped: its answer was
    # always overwritten with the unused label, so it only added latency.
    used = set(mapping.values())
    unused = [lab for lab in canonical_labels if lab not in used]
    if unused and model_candidates:
        counts = {lab: 0 for lab in canonical_labels}
        for lab in mapping.values():
            counts[lab] = counts.get(lab, 0) + 1
        for lab in unused:
            # Clusters whose current label would still be represented after giving one up.
            donors = [cid for cid in all_ids if counts.get(mapping.get(cid, ''), 0) > 1]
            if not donors:
                # No label is duplicated any more; coverage cannot be improved further.
                break
            # Most frequent label donates; ties resolve to the lowest cluster id.
            cid = max(donors, key=lambda c: counts[mapping[c]])
            counts[mapping[cid]] -= 1
            mapping[cid] = lab
            counts[lab] = counts.get(lab, 0) + 1

    # Finalize vibe_names for all clusters
    canonical_count = 0
    for k in cluster_profiles.keys():
        name = mapping.get(int(k), "")
        if name in allowed_set:
            canonical_count += 1
            vibe_names[k] = name
        else:
            vibe_names[k] = f"Vibe {k}"

    if canonical_count == 0:
        raise RuntimeError(
            "No canonical vibe labels were produced. Ensure 'ollama serve' is running and a supported model is installed (e.g., 'ollama pull qwen2.5:3b-instruct')."
        )
except Exception as e:
    print(f"Local LLM naming skipped or failed: {e}")
    for k in cluster_profiles.keys():
        vibe_names[k] = f"Vibe {k}"

# Translate each track's cluster id into its vibe name; rows flagged as
# 'blended' (low-confidence) are labelled 'Blended' instead.
mapped_vibe = embedded_music_data['vibe_cluster'].map(vibe_names)
embedded_music_data['vibe'] = np.where(embedded_music_data['blended'], 'Blended', mapped_vibe)
print("Vibe clusters:", vibe_names)

# Count how many clusters ended up with a canonical label; if the label list
# was never defined, report zero canonical clusters.
if 'canonical_labels' in locals():
    canonical_set = set(canonical_labels)
    num_canonical = len([v for v in vibe_names.values() if v in canonical_set])
else:
    num_canonical = 0
clusters_total = len(vibe_names)
print({"clusters_total": clusters_total, "clusters_canonical": num_canonical, "clusters_neutral": clusters_total - num_canonical})

Vibe clusters: {0: 'Chill', 1: 'Pump Up', 2: 'Dance', 3: 'Midnight Blues', 4: 'Lofi', 5: 'Acoustic', 6: 'Upbeat', 7: 'Soft Haze', 8: 'Pump Up', 9: 'Chill', 10: 'Pop', 11: 'Acoustic', 12: 'Dance', 13: 'Midnight Blues', 14: 'Groove', 15: 'Acoustic'}
{'clusters_total': 16, 'clusters_canonical': 16, 'clusters_neutral': 0}


In [65]:
# Preview the first rows, including the newly assigned 'vibe' column.
embedded_music_data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,release_date,speechiness,tempo,lyrics,embedding_json,embedding,vibe_cluster,vibe_conf,blended,vibe
0,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,...,1921,0.415,60.936,Now Clancy was a peaceful man\nIf you know wha...,"{""name"": ""Clancy Lowered the Boom"", ""artists"":...","[-0.143137351, -0.229247168, -4.84054565, -0.9...",13,0.885227,False,Midnight Blues
1,0.402,1923,0.992,['Bessie Smith'],0.693,167640,0.027,0,6qRvnXftofjYJm1Mg98UWL,0.0,...,1923,0.0562,75.749,"Tired of bein' lonely, tired of bein' blue\nI ...","{""name"": ""Need a Little Sugar in My Bowl"", ""ar...","[-0.699765742, 0.0118034305, -4.59403515, -1.2...",12,0.870942,False,Dance
2,0.494,1923,0.99,['Bessie Smith'],0.71,169960,0.0735,0,1yjck0Owf0HfhY5kWHiXIQ,0.0,...,1923,0.0488,90.917,Stay away from me 'cause I'm in my sin\nStay a...,"{""name"": ""Me and My Gin"", ""artists"": [""Bessie ...","[-0.820548534, -0.864390433, -4.8381505, -1.14...",12,0.976631,False,Dance
3,0.483,1923,0.992,['Bessie Smith'],0.587,205053,0.175,0,0rUf1j43orpfi6f4LbVo8D,0.000318,...,1923,0.0384,98.523,"Love, oh love, oh careless love\nYou've fly th...","{""name"": ""Careless Love"", ""artists"": [""Bessie ...","[-0.186222211, -0.908721805, -4.75371647, -0.9...",4,0.741036,False,Lofi
4,0.258,1923,0.991,"['Bessie Smith', 'Louis Armstrong']",0.355,189600,0.114,0,50zXyjVdFb3xAr3hnyYYn1,0.00686,...,1923,0.0588,69.902,I hate to see that evening sun go down\nI hate...,"{""name"": ""St. Louis Blues"", ""artists"": [""Bessi...","[-0.788981915, -0.250355572, -4.90698481, -0.9...",9,0.898343,False,Chill


In [66]:
# Vibe label distribution (after labeling and dedup, before Milvus insertion)
from collections import OrderedDict

# Canonical labels (Good Vibes removed)
canonical_labels = [
    "Chill", "Pop", "Dance", "Acoustic", "Upbeat", "Groove",
    "Lofi", "Soft Haze", "Pump Up", "Midnight Blues"
]

# Missing vibes become empty strings, so they match none of the reported labels.
vc = embedded_music_data['vibe'].fillna('').astype(str)
total = len(vc)

# Tally every label in a single pass, then pick out the canonical labels plus
# 'Blended' (labels that never occur are reported as 0).
label_freq = vc.value_counts()
counts = OrderedDict(
    (lab, int(label_freq.get(lab, 0))) for lab in [*canonical_labels, 'Blended']
)

# Report labels from most to least frequent, with their share of all tracks.
print(f"Total tracks: {total}")
ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
for k, v in ranked:
    if total:
        pct = 100.0 * v / total
    else:
        pct = 0.0
    print(f"{k:20s}: {v:6d}  ({pct:5.1f}%)")

Total tracks: 26802
Dance               :   8133  ( 30.3%)
Acoustic            :   3215  ( 12.0%)
Midnight Blues      :   3058  ( 11.4%)
Pump Up             :   2983  ( 11.1%)
Pop                 :   2655  (  9.9%)
Chill               :   2116  (  7.9%)
Lofi                :   1547  (  5.8%)
Upbeat              :   1084  (  4.0%)
Groove              :   1005  (  3.7%)
Soft Haze           :    638  (  2.4%)
Blended             :    368  (  1.4%)


In [67]:
# Persist back to CSV so downstream ingestion includes 'vibe'.
# The destination defaults to the original location, but can be redirected by
# setting the RHYTHMODORO_EMBEDDINGS_CSV environment variable so the notebook
# is not tied to a single machine's directory layout.
EMBEDDINGS_CSV_PATH = os.environ.get(
    'RHYTHMODORO_EMBEDDINGS_CSV',
    '/Users/devyanigauri/Documents/GitHub/rhythmodoro/src/spotify_dataset_with_embeddings.csv',
)
embedded_music_data.to_csv(EMBEDDINGS_CSV_PATH, index=False)
print("Saved CSV with 'vibe', 'vibe_cluster', and 'vibe_conf' columns.")

Saved CSV with 'vibe', 'vibe_cluster', and 'vibe_conf' columns.


In [68]:
# Number of tracks (rows) in the labelled dataset.
len(embedded_music_data)

26802

In [73]:
# Inspect the embedding vector stored for the first track.
embedded_music_data['embedding'].iloc[0]

[-0.143137351,
 -0.229247168,
 -4.84054565,
 -0.968335271,
 0.524305642,
 0.728669226,
 0.871309638,
 -0.00974745117,
 -0.156411305,
 -0.443167061,
 -0.640500605,
 1.41033804,
 0.149887279,
 1.41451502,
 1.16565764,
 -1.39578736,
 -1.36437023,
 -2.05990171,
 0.538945615,
 0.581114948,
 -0.229293153,
 -1.79353511,
 -1.72583795,
 -0.670438826,
 2.33432293,
 0.383269131,
 0.0277688112,
 -0.130079582,
 -0.0562638566,
 0.408605903,
 0.219170481,
 0.418112129,
 -0.780590475,
 -0.506763816,
 -0.831357062,
 0.0463779345,
 -0.216171831,
 0.917917013,
 0.396962315,
 0.366175532,
 1.21814239,
 0.19726634,
 0.0773748308,
 -0.208258241,
 -0.228960991,
 -1.34889233,
 0.346865088,
 0.740144312,
 -0.536022067,
 -0.788160741,
 0.154958323,
 -0.995409131,
 -0.0028208266,
 -0.811453402,
 1.61176145,
 0.656850398,
 -0.0377004817,
 -0.343861073,
 -0.464827538,
 0.426975578,
 0.759593308,
 0.507536829,
 0.358966202,
 1.20514977,
 1.35536849,
 -1.13232756,
 -0.180443853,
 1.55379629,
 0.540129602,
 -0.317142

In [74]:
# from pymilvus import list_collections
# list_collections()

In [75]:
from pymilvus import Collection, list_collections

# Reset only the collection this notebook recreates below. Dropping every
# collection returned by list_collections() would also destroy unrelated data
# living on the same Milvus server.
TARGET_COLLECTION = "embedded_music_data"

if TARGET_COLLECTION in list_collections():
    print(f"Dropping collection: {TARGET_COLLECTION}")
    Collection(TARGET_COLLECTION).drop()

In [76]:
# Define metadata fields
def _varchar(name, max_length, **extra):
    """Build a VARCHAR field with the given maximum length (extra kwargs pass through)."""
    return FieldSchema(name=name, dtype=DataType.VARCHAR, **extra, max_length=max_length)


def _int64(name):
    """Build an INT64 field."""
    return FieldSchema(name=name, dtype=DataType.INT64)


# Field order matters: the insertion cell supplies columns in this same order.
fields = [
    _varchar("id", 64, is_primary=True, auto_id=False),
    _varchar("name", 256),
    _varchar("artists", 1000),
    _int64("year"),
    _varchar("release_date", 32),
    FieldSchema(name="explicit", dtype=DataType.BOOL),
    _int64("popularity"),
    _int64("duration_ms"),
    _int64("key"),
    _int64("mode"),
    _varchar("vibe", 64),
    _varchar("embedding_json", 2048),
    # assumes all embeddings have same length
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),
]

schema = CollectionSchema(fields, description="Music tracks with metadata, vibe labels, and Nomic embeddings")

# Create the collection with 2 shards for better distribution
collection_options = {
    "name": "embedded_music_data",
    "schema": schema,
    "using": "default",
    "shards_num": 2,
}
collection = Collection(**collection_options)

print("Milvus collection 'embedded_music_data' created.")

Milvus collection 'embedded_music_data' created.


In [77]:
# Load the collection
collection = Collection("embedded_music_data")

# Rows sent per insert request.
batch_size = 1000
num_rows = len(embedded_music_data)


def _artists_to_text(value):
    """Render an artists cell as text for the VARCHAR field.

    Lists/tuples are joined with ", "; missing values become an empty string;
    anything else is converted with str().
    """
    if isinstance(value, (list, tuple)):
        return ", ".join(value)
    return "" if pd.isna(value) else str(value)


for start in range(0, num_rows, batch_size):
    end = min(start + batch_size, num_rows)
    batch = embedded_music_data.iloc[start:end]
    # Stringify artists for VARCHAR field
    artists_str = batch["artists"].apply(_artists_to_text)
    # Column-oriented payload; the order must match the collection schema.
    data = [
        batch["id"].tolist(),
        batch["name"].tolist(),
        artists_str.tolist(),
        batch["year"].tolist(),
        batch["release_date"].tolist(),
        # The schema declares BOOL while the frame preview shows 0/1 values,
        # so cast explicitly instead of relying on implicit coercion.
        batch["explicit"].astype(bool).tolist(),
        batch["popularity"].tolist(),
        batch["duration_ms"].tolist(),
        batch["key"].tolist(),
        batch["mode"].tolist(),
        batch["vibe"].fillna("").astype(str).tolist(),
        batch["embedding_json"].tolist(),
        batch["embedding"].tolist(),
    ]
    collection.insert(data)
    print(f"Inserted rows {start} to {end}")

# Flush once after all batches so the inserted rows are sealed and persisted
# before the index is built and the connection is closed in later cells.
collection.flush()

print("All data inserted into Milvus.")

Inserted rows 0 to 1000
Inserted rows 1000 to 2000
Inserted rows 2000 to 3000
Inserted rows 1000 to 2000
Inserted rows 2000 to 3000
Inserted rows 3000 to 4000
Inserted rows 3000 to 4000
Inserted rows 4000 to 5000
Inserted rows 5000 to 6000
Inserted rows 4000 to 5000
Inserted rows 5000 to 6000
Inserted rows 6000 to 7000
Inserted rows 6000 to 7000
Inserted rows 7000 to 8000
Inserted rows 8000 to 9000
Inserted rows 7000 to 8000
Inserted rows 8000 to 9000
Inserted rows 9000 to 10000
Inserted rows 10000 to 11000
Inserted rows 9000 to 10000
Inserted rows 10000 to 11000
Inserted rows 11000 to 12000
Inserted rows 11000 to 12000
Inserted rows 12000 to 13000
Inserted rows 12000 to 13000
Inserted rows 13000 to 14000
Inserted rows 13000 to 14000
Inserted rows 14000 to 15000
Inserted rows 15000 to 16000
Inserted rows 14000 to 15000
Inserted rows 15000 to 16000
Inserted rows 16000 to 17000
Inserted rows 17000 to 18000
Inserted rows 16000 to 17000
Inserted rows 17000 to 18000
Inserted rows 18000 to 1

In [78]:
# index_params = {
#     "metric_type": "IP",  # Inner Product; matches cosine similarity only when the embeddings are L2-normalized
#     "index_type": "HNSW", # Graph-based approximate nearest-neighbor index
#     "params": {"M": 8, "efConstruction": 64} # M: number of bi-directional links per node (8 is a good balance of accuracy and memory for this dataset size); efConstruction: controls index build accuracy/speed (64 for good recall and reasonable build time)
# }
# collection.create_index(field_name="embedding", index_params=index_params)

In [79]:
# Build an HNSW index on the embedding field using the L2 metric.
new_index_params = dict(
    metric_type="L2",  # or "IP", "COSINE", etc.
    index_type="HNSW",
    params=dict(M=16, efConstruction=128),
)
collection.create_index(field_name="embedding", index_params=new_index_params)

Status(code=0, message=)

In [80]:
# Close the "default" Milvus connection opened at the top of the notebook.
connections.disconnect(alias="default")