In [7]:
"""
Spotify Track Recommender using Direct Features (Efficient Method)
"""

# ===========================================================
# 1. Setup and Data Loading
# ===========================================================
import os
import numpy as np
import pandas as pd
# Removed torch and nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import kagglehub

# --- Load dataset ---
print("Downloading dataset...")
path = kagglehub.dataset_download("zaheenhamidani/ultimate-spotify-tracks-db")
file_path = os.path.join(path, "SpotifyFeatures.csv")

if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_csv(file_path)
print(f"Data loaded successfully. Shape: {df.shape}")

Downloading dataset...
Data loaded successfully. Shape: (232725, 18)


In [8]:
# ===========================================================
# 2. Data Preprocessing
# ===========================================================

audio_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'
]

# Drop rows with missing audio features
df = df.dropna(subset=audio_features).reset_index(drop=True)

# --- Encode categorical features ---
df['mode'] = df['mode'].map({'Major': 1, 'Minor': 0})
df = pd.get_dummies(df, columns=['key', 'time_signature'], prefix=['key', 'ts'])

# --- Scale continuous features ---
continuous_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'
]

scaler = StandardScaler()
scaled = pd.DataFrame(
    scaler.fit_transform(df[continuous_features]),
    columns=continuous_features,
    index=df.index
)

# --- Recombine all features ---
X = pd.concat([df.drop(columns=continuous_features), scaled], axis=1)
X['genre'] = X['genre'].replace("Children’s Music", "Children's Music")
X = pd.get_dummies(X, columns=['genre'], prefix='genre')

# --- Aggregate by track_id ---
agg_dict = {col: ('sum' if col.startswith('genre_') else 'first') for col in X.columns if col != 'track_id'}
X = X.groupby('track_id', as_index=False).agg(agg_dict)

# Drop irrelevant genres
X = X.drop(columns=["genre_Children's Music"], errors='ignore')
print(f"Preprocessing complete. Final shape: {X.shape}")

Preprocessing complete. Final shape: (176774, 57)


In [18]:
# ===========================================================
# 3. Create and Save Embeddings
# ===========================================================

# We extract only the numeric columns that will represent our "embeddings"
X_num = X.select_dtypes(include=np.number)

# We directly use the processed feature values as embeddings
emb = X_num.values

# Save the embeddings for future use
#np.save("embeddings.npy", emb)
emb = emb.astype(np.float16)
np.save("embeddings.npy", emb)

print("Embeddings (direct features) saved successfully.")
print(f"Embeddings Shape: {emb.shape}")

Embeddings (direct features) saved successfully.
Embeddings Shape: (176774, 37)


In [19]:
# ===========================================================
# 4. Recommendation Utilities
# ===========================================================
GENRE_COLS = [c for c in X.columns if c.startswith('genre_')]

def find_song_index(title, artist):
    """Finds the index of a song in the DataFrame X."""
    # Use .str.lower() for case-insensitive matching
    row = X[(X['track_name'].str.lower() == title.lower()) &
            (X['artist_name'].str.lower() == artist.lower())]

    if row.empty:
        # Try a "contains" search if the exact search fails
        row_approx = X[(X['track_name'].str.lower().str.contains(title.lower(), na=False)) &
                       (X['artist_name'].str.lower().str.contains(artist.lower(), na=False))]

        if row_approx.empty:
            raise ValueError(f"Song not found: {title} by {artist}")

        # If we find multiple matches, take the first one
        print(f"Exact match not found. Using: {row_approx.iloc[0]['track_name']} by {row_approx.iloc[0]['artist_name']}")
        return row_approx.index[0]

    return row.index[0]

def get_song_details(idx):
    """Gets formatted details for a song given its index."""
    song = X.iloc[idx]
    genres = [g.replace('genre_', '') for g in GENRE_COLS if song[g] > 0]
    return f"Title: {song['track_name']}\nArtist: {song['artist_name']}\nGenres: {', '.join(genres) or 'N/A'}"

def get_recommendations(idx, top_k=10):
    """Finds the top_k most similar songs using cosine similarity."""

    # Calculate cosine similarity between the chosen song (idx) and all others
    sims = cosine_similarity(emb[idx].reshape(1, -1), emb).flatten()

    # Get the indices of the most similar songs, excluding the song itself (which is in the first spot)
    top_idx = sims.argsort()[::-1][1:top_k+1]

    # Create a DataFrame with the results
    recs = X.iloc[top_idx].copy()
    recs['similarity'] = sims[top_idx]

    # Add genres for easy reading
    recs['genres'] = recs.apply(
        lambda r: ', '.join([g.replace('genre_', '') for g in GENRE_COLS if r[g] > 0]),
        axis=1
    )

    return recs[['track_name', 'artist_name', 'genres', 'similarity']]

In [20]:
# ===========================================================
# 5. Example Run
# ===========================================================

# --- Test 1: The Strokes ---
SONG_TITLE = "Is This It"
ARTIST_NAME = "The Strokes"

try:
    idx = find_song_index(SONG_TITLE, ARTIST_NAME)
    print("\n--- Recommending for: ---")
    print(get_song_details(idx))
    print("\n--- Top 10 Recommendations ---")
    print(get_recommendations(idx))
except Exception as e:
    print(f"Error: {e}")


--- Recommending for: ---
Title: Is This It
Artist: The Strokes
Genres: Alternative, Indie, Rock

--- Top 10 Recommendations ---
                               track_name       artist_name  \
73410                            20 Years          Bad Suns   
170111                            Someday       The Strokes   
80127                             Thrills              Cake   
126743  When It's Over - David Kahne Main         Sugar Ray   
64596                       Comfort Eagle              Cake   
112346                  Come Together Now      Matt and Kim   
25356                           City Club      The Growlers   
25890                   Para No Verte Más  La Mosca Tse-Tse   
88259                           Die Happy          DREAMERS   
159406                          Love Test      The Growlers   

                          genres  similarity  
73410                Indie, Rock    0.999696  
170111  Alternative, Indie, Rock    0.999688  
80127   Alternative, Indie, Rock   

In [21]:
# --- Test 2: Another genre (e.g., Electronic) ---
SONG_TITLE = "Around the World"
ARTIST_NAME = "Daft Punk"

try:
    idx = find_song_index(SONG_TITLE, ARTIST_NAME)
    print("\n" + "="*30)
    print("--- Recommending for: ---")
    print(get_song_details(idx))
    print("\n--- Top 10 Recommendations ---")
    print(get_recommendations(idx))
except Exception as e:
    print(f"Error: {e}")


--- Recommending for: ---
Title: Around The World
Artist: Daft Punk
Genres: Electronic

--- Top 10 Recommendations ---
                                   track_name            artist_name  \
125793                            Do It Again  The Chemical Brothers   
32700   Blue Monday - 2016 Remastered Version              New Order   
57401                                    Just                  Bicep   
167344               Mogadishu - Original Mix   Shakarchi & Stranéus   
133340       Blue Monday - 2011 Total Version              New Order   
132495                         Revolution 909              Daft Punk   
30443                           Dance with Me             Chris Lake   
176611                         Jack Come Back            Joe Goddard   
122073                                Musique              Daft Punk   
11096                         Feeling for You                Cassius   

            genres  similarity  
125793  Electronic    0.999843  
32700   Electronic   

In [22]:
# ===========================================================
# 4. Metadata saving
# ===========================================================
genre_cols = [col for col in X.columns if col.startswith("genre_")]
columns_to_keep = ['track_id', 'track_name', 'artist_name'] + genre_cols

X[columns_to_keep].to_parquet("song_metadata.parquet", index=False)
print("File 'song_metadata.parquet' saved successfully!")

File 'song_metadata.parquet' saved successfully!
