In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import spotipy
import json
from spotipy.oauth2 import SpotifyOAuth
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
from sklearn.metrics.pairwise import cosine_similarity

## Data Preparation

Item representation: each row of the matrix represents a unique item. In this case, the name of the song.

Feature Representation: each column represents a specific feature associated with each item. Relating it back to our scope of music, this can be features like danceability, energy, tempo, etc.

Vector Representation: represents each item as a vector in a high-dimensional space, where the dimensions correspond to the features.

To conduct a cosine similarity measure of the user's playlist and the 1M song database, it is essential that the number and arrangement of features align.

### Tracks Vector

In [2]:
df = pd.read_csv('/Users/silvialee/Downloads/spotify recommendation system/spotify_data.csv')

# drop unnecessary columns
df = df.drop(['Unnamed: 0', 'key', 'duration_ms', 'time_signature'], axis = 1)
df

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,-10.058,1,0.0429,0.6940,0.000000,0.1150,0.1390,133.406
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,-10.286,1,0.0258,0.4770,0.000014,0.0974,0.5150,140.182
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,-13.711,1,0.0323,0.3380,0.000050,0.0895,0.1450,139.832
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,-9.845,1,0.0363,0.8070,0.000000,0.0797,0.5080,204.961
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.430,0.791,-5.419,0,0.0302,0.0726,0.019300,0.1100,0.2170,171.864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159759,Nicola Conte,Black Spirits,0m27F0IGHLGAWhqd6ccYst,4,2011,trip-hop,0.373,0.742,-6.453,0,0.0736,0.3250,0.000141,0.1590,0.5220,107.951
1159760,Nicola Conte,Quiet Dawn,6er9p611eHEcUCU50j7D57,3,2011,trip-hop,0.516,0.675,-7.588,0,0.0326,0.7880,0.000129,0.1300,0.2640,119.897
1159761,Amon Tobin,Morning Ms Candis,7jsMMqxy1tt0rH5FzYcZTQ,2,2011,trip-hop,0.491,0.440,-8.512,1,0.0274,0.4770,0.003130,0.0936,0.0351,100.076
1159762,Peace Orchestra,Happy Christmas (War Is Over),77lA1InUaXztuRk2vOzD1S,0,2011,trip-hop,0.480,0.405,-13.343,1,0.0276,0.4310,0.000063,0.1250,0.2020,133.885


### Playlist Vector

In [3]:
# sets up the Spotify API credentials
client_id = '1203d1ca68574060af904a20d8e120e3'
client_secret = '192ac71cb664471ab93496735217448c'
redirect_uri = 'http://localhost:3000'

# Initializes the Spotipy (library for the Spotify Web API). 
# SpotifyOAuth is used to authenticate and authorize access to Spotify’s data.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id, client_secret, redirect_uri))


In [4]:
# Extract playlist ID from a Spotify playlist link
playlist_link = 'https://open.spotify.com/playlist/37i9dQZF1DX5Q5wA1hY6bS?si=e43b100b9c734ba3'
playlist_id = '37i9dQZF1DX5Q5wA1hY6bS'

# Retrieves the first 50 songs from that playlist using the Spotify API
playlist_tracks = sp.playlist_tracks(playlist_id, limit=50)

In [5]:
# List to store data
titles, artists, uri = [], [], []

# Iterate through each song in the playlist  extract song details
for item in playlist_tracks['items']:       # items is a list containing dictionaries. Each dictionary represents one track and its data. Can't customize bc it's a fixed API structure within Spotify
    track = item['track']           # 'track' now holds a dictionary with various details about the song, such as its name, artists, album, and more
    titles.append(track['name'])    # Extract and Store the Track Name --> track['artists'] = [{'name': 'Artist A'}, {'name': 'Artist B'}]
    artist_names = ', '.join([artist['name'] for artist in track['artists']])   # list comprehension per song ['Artist A', 'Artist B'] -->  extract values associated with name from artist dictionary above
    artists.append(artist_names)    # Extract and Store Artist Names
    uri.append(track['uri'])        # Extract and Store the Track URI


In [6]:
# Create a DataFrame for the Playlist to store song details that we extracted above
data = {'Title': titles, 'Artist': artists, 'uri': uri}
playlist = pd.DataFrame(data)

In [7]:
# Initializes new columns in the DataFrame and fill them with placeholders
# We will fill them with various audio features per track that will be fetched from Spotify's API on the next step
new_feat = ['danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
for item in new_feat:
    playlist[item] = 0


In [8]:
for i in range(len(playlist)):
    track_uri = playlist.iloc[i].uri
    audio_features = sp.audio_features(track_uri)
    json_string = json.dumps(audio_features[0])
    dictionary = json.loads(json_string)
    
    #update feature values
    for feature in new_feat:
        playlist.loc[i, feature] = dictionary[feature]

In [9]:
playlist

Unnamed: 0,Title,Artist,uri,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Can’t Catch Me Now - from The Hunger Games: Th...,Olivia Rodrigo,spotify:track:56xHMIfQPoe0prrSi3BGhf,0.409,0.347,-8.159,0,0.0359,0.834,5e-06,0.121,0.199,141.332
1,Letting Go,Angie McMahon,spotify:track:2XHznZZIWLkh7xO3WQAjpp,0.485,0.753,-6.652,1,0.0548,0.443,0.00912,0.121,0.601,170.043
2,Evergreen,Richy Mitch & The Coal Miners,spotify:track:6me7F0aaZjwDo6RJ5MrfBD,0.555,0.216,-11.661,1,0.0721,0.557,0.00416,0.109,0.504,79.109
3,The Bird Song,Noah Floersch,spotify:track:5DEpOV9no5cf22c5Lj198g,0.453,0.546,-5.477,1,0.0469,0.767,0.0,0.0814,0.654,116.053
4,You’re Gonna Go Far,Noah Kahan,spotify:track:4nHJcUtNSUVjXRnjdP29Bk,0.586,0.358,-9.761,1,0.0308,0.556,0.0,0.11,0.365,169.925
5,Work Song,Hozier,spotify:track:5TgEJ62DOzBpGxZ7WRsrqb,0.531,0.363,-7.672,1,0.0608,0.749,0.0,0.112,0.259,121.412
6,Alaska,Maggie Rogers,spotify:track:4HfLQJtVT1KiX1eVedDyTm,0.847,0.386,-10.668,0,0.0507,0.411,0.00176,0.108,0.178,104.001
7,Anchor,Novo Amor,spotify:track:7qH9Z4dJEN0l9bidizW7fq,0.457,0.407,-11.475,1,0.0308,0.805,0.884,0.126,0.126,117.053
8,Beige,Yoke Lore,spotify:track:7uYQELhe7g6QLIzZDOlhbW,0.434,0.677,-7.535,1,0.119,0.384,0.0684,0.119,0.218,167.923
9,Maine,"hey, nothing",spotify:track:3pf3hHjBJP6E4zAaSDrl8r,0.52,0.508,-5.501,1,0.0369,0.0234,2e-05,0.144,0.191,168.015


## Data Cleaning

### 1. Tracks Vector

1. Using Multi-Hot Encoding to Represent Genres

In order to create an item-feature matrix to use the cosine similarity algorithm, all column types must be of numerical value. So, I would need to convert genre string values into integer values. Multi-hot-encoding is used to represent categorical data as binary vectors (0 and 1)

In [10]:
# convert values in genre feature as binomial features
# Use pd.get_dummies() to automatically convert the 'genre' column into binomial features
df = pd.get_dummies(df, columns=['genre'], prefix='genre')


# Drop any unnecessary columns if needed (you've already dropped 'genre', but other columns might not be required)

# drop genre column in feat_vec df
#df.drop('genre', axis=1, inplace=True)    

2. Capture Periodic Columns 
See temporal patterns and reduce dimensionality

In [11]:
# Make columns for each time period as binomial
df['year_2000-2004'] = df['year'].apply(lambda 
                                        year: 1 if year>=2000 and year<2005 else 0)
df['year_2005-2009'] = df['year'].apply(lambda 
                                        year: 1 if year>=2005 and year<2009 else 0)
df['year_2010-2014'] = df['year'].apply(lambda 
                                        year: 1 if year>=2010 and year<2014 else 0)
df['year_2015-2019'] = df['year'].apply(lambda 
                                        year: 1 if year>=2015 and year<2019 else 0)
df['year_2020-2023'] = df['year'].apply(lambda 
                                        year: 1 if year>=2020 and year<2023 else 0)

# Drop year column, no longer needed
df = df.drop(columns=['year'])

3. Standardize other columns into 0-100 scale

popularity scale: 1-100, 
loudness scale: -60-0, 
tempo scale: 0-250, 

In [12]:
# add min and max values for each row to establish min and max values, then once scaling is done, remove min and max columns
min_row = {'popularity': '0', 'loudness': '-60', 'tempo': '0'}
max_row = {'popularity': '100', 'loudness': '0', 'tempo': '250'}

min_row_df = pd.DataFrame([min_row])
max_row_df = pd.DataFrame([max_row])

In [13]:
df = pd.concat([df, min_row_df], ignore_index=True)
df = pd.concat([df, max_row_df], ignore_index=True)

df

Unnamed: 0,artist_name,track_name,track_id,popularity,danceability,energy,loudness,mode,speechiness,acousticness,...,genre_swedish,genre_tango,genre_techno,genre_trance,genre_trip-hop,year_2000-2004,year_2005-2009,year_2010-2014,year_2015-2019,year_2020-2023
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,0.483,0.303,-10.058,1.0,0.0429,0.6940,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,0.572,0.454,-10.286,1.0,0.0258,0.4770,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,0.409,0.234,-13.711,1.0,0.0323,0.3380,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,0.392,0.251,-9.845,1.0,0.0363,0.8070,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,0.430,0.791,-5.419,0.0,0.0302,0.0726,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159761,Amon Tobin,Morning Ms Candis,7jsMMqxy1tt0rH5FzYcZTQ,2,0.491,0.440,-8.512,1.0,0.0274,0.4770,...,False,False,False,False,True,0.0,0.0,1.0,0.0,0.0
1159762,Peace Orchestra,Happy Christmas (War Is Over),77lA1InUaXztuRk2vOzD1S,0,0.480,0.405,-13.343,1.0,0.0276,0.4310,...,False,False,False,False,True,0.0,0.0,1.0,0.0,0.0
1159763,Mo' Horizons,Hit the Road Jack (Pé Na Éstrada),4oMiOwhDZEdBuzAfhzRHbi,3,0.782,0.861,-7.292,0.0,0.1250,0.2200,...,False,False,False,False,True,0.0,0.0,1.0,0.0,0.0
1159764,,,,0,,,-60,,,,...,,,,,,,,,,


In [14]:
scale = ['popularity', 'loudness', 'tempo']
scaler = MinMaxScaler()
df[scale] = scaler.fit_transform(df[scale])

# drop min and max values
df = df.iloc[:-2]

df


Unnamed: 0,artist_name,track_name,track_id,popularity,danceability,energy,loudness,mode,speechiness,acousticness,...,genre_swedish,genre_tango,genre_techno,genre_trance,genre_trip-hop,year_2000-2004,year_2005-2009,year_2010-2014,year_2015-2019,year_2020-2023
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,0.68,0.483,0.303,0.754730,1.0,0.0429,0.6940,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,0.50,0.572,0.454,0.751285,1.0,0.0258,0.4770,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,0.57,0.409,0.234,0.699525,1.0,0.0323,0.3380,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,0.58,0.392,0.251,0.757949,1.0,0.0363,0.8070,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,0.54,0.430,0.791,0.824835,0.0,0.0302,0.0726,...,False,False,False,False,False,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159759,Nicola Conte,Black Spirits,0m27F0IGHLGAWhqd6ccYst,0.04,0.373,0.742,0.809209,0.0,0.0736,0.3250,...,False,False,False,False,True,0.0,0.0,1.0,0.0,0.0
1159760,Nicola Conte,Quiet Dawn,6er9p611eHEcUCU50j7D57,0.03,0.516,0.675,0.792057,0.0,0.0326,0.7880,...,False,False,False,False,True,0.0,0.0,1.0,0.0,0.0
1159761,Amon Tobin,Morning Ms Candis,7jsMMqxy1tt0rH5FzYcZTQ,0.02,0.491,0.440,0.778093,1.0,0.0274,0.4770,...,False,False,False,False,True,0.0,0.0,1.0,0.0,0.0
1159762,Peace Orchestra,Happy Christmas (War Is Over),77lA1InUaXztuRk2vOzD1S,0.00,0.480,0.405,0.705087,1.0,0.0276,0.4310,...,False,False,False,False,True,0.0,0.0,1.0,0.0,0.0


### 2. Playlist Vector

The spotify api doesn't have genres for each song and we need genres feature to perform cosine similarity. However, Spotify provides genres for each artist. So, we should use task parallelization.

In [15]:
# Create a list of artist names from the playlist
artist_names = playlist['Artist'].tolist()

# Create an empty list to store genres
genres = []

# Parallelization process function to iterate through artist names and retrieve genres
# take an artist's name and a Spotipy client and return a string of genres associated with that artist
def process_artist(artist_name, sp):
    search_results = sp.search(q=artist_name, type='artist')
    genres_info = []

    # # if search results contain artists, iterate through list of returned artists
    if 'artists' in search_results and 'items' in search_results['artists']:
        artists = search_results['artists']['items']

        # artist with name matching artist_name is found, genres for that artist are extracted, 
        # which retrieves the genres key from artist dictionary or returns empty list if key doesn't exist 
        for artist in artists:
            if artist['name'] == artist_name:
                genres_info = artist.get('genres', [])
                break
    
    # convert genre list to string
    genre_string = ', '.join(genres_info) if genres_info else 'No Genre Found'
    return genre_string

# Fill genre for each song using Parallelization (distributing execution of process_artist function across multiple CPUs)
# n_jobs=-1 -- use all available CPUs to maximize utilization of system's processing power
# delayed(process_artist) pass process_artist function and its arguments (artist_name, sp) to parallel without immediately executing it
# create function for each artist name in artist_names, allowing joblib to execute them in parallel
# store results into genres list
genres = Parallel(n_jobs=-1)(delayed(process_artist)(artist_name, sp) for artist_name in artist_names)

# update playlist dataframe
playlist['Genre'] = genres

playlist

Unnamed: 0,Title,Artist,uri,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,Genre
0,Can’t Catch Me Now - from The Hunger Games: Th...,Olivia Rodrigo,spotify:track:56xHMIfQPoe0prrSi3BGhf,0.409,0.347,-8.159,0,0.0359,0.834,5e-06,0.121,0.199,141.332,pop
1,Letting Go,Angie McMahon,spotify:track:2XHznZZIWLkh7xO3WQAjpp,0.485,0.753,-6.652,1,0.0548,0.443,0.00912,0.121,0.601,170.043,australian indie
2,Evergreen,Richy Mitch & The Coal Miners,spotify:track:6me7F0aaZjwDo6RJ5MrfBD,0.555,0.216,-11.661,1,0.0721,0.557,0.00416,0.109,0.504,79.109,modern folk rock
3,The Bird Song,Noah Floersch,spotify:track:5DEpOV9no5cf22c5Lj198g,0.453,0.546,-5.477,1,0.0469,0.767,0.0,0.0814,0.654,116.053,No Genre Found
4,You’re Gonna Go Far,Noah Kahan,spotify:track:4nHJcUtNSUVjXRnjdP29Bk,0.586,0.358,-9.761,1,0.0308,0.556,0.0,0.11,0.365,169.925,"pov: indie, singer-songwriter pop"
5,Work Song,Hozier,spotify:track:5TgEJ62DOzBpGxZ7WRsrqb,0.531,0.363,-7.672,1,0.0608,0.749,0.0,0.112,0.259,121.412,"irish singer-songwriter, modern rock, pop, pov..."
6,Alaska,Maggie Rogers,spotify:track:4HfLQJtVT1KiX1eVedDyTm,0.847,0.386,-10.668,0,0.0507,0.411,0.00176,0.108,0.178,104.001,indie pop
7,Anchor,Novo Amor,spotify:track:7qH9Z4dJEN0l9bidizW7fq,0.457,0.407,-11.475,1,0.0308,0.805,0.884,0.126,0.126,117.053,"ambient folk, indie folk"
8,Beige,Yoke Lore,spotify:track:7uYQELhe7g6QLIzZDOlhbW,0.434,0.677,-7.535,1,0.119,0.384,0.0684,0.119,0.218,167.923,nyc pop
9,Maine,"hey, nothing",spotify:track:3pf3hHjBJP6E4zAaSDrl8r,0.52,0.508,-5.501,1,0.0369,0.0234,2e-05,0.144,0.191,168.015,modern indie folk


In [16]:
# Ensure the Genre column has been updated with genres
if 'Genre' in playlist.columns:
    # Split genre strings into individual genres
    genre_list = list(set([genre.strip() for sublist in playlist['Genre'].str.split(', ') for genre in sublist]))

    # Apply one-hot encoding for genres only
    for genre in genre_list:
        playlist['genre_' + genre] = playlist['Genre'].apply(lambda x: 1 if genre in x else 0)

    # Drop the original 'Genre' column since it is now encoded into binary columns
    playlist.drop(columns=['Genre'], inplace=True)

playlist


Unnamed: 0,Title,Artist,uri,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,genre_swedish country,genre_irish singer-songwriter,genre_canadian singer-songwriter,genre_israeli indie,genre_stomp and holler,genre_ectofolk,genre_anti-folk,genre_nyc pop,genre_modern indie folk,genre_modern folk rock
0,Can’t Catch Me Now - from The Hunger Games: Th...,Olivia Rodrigo,spotify:track:56xHMIfQPoe0prrSi3BGhf,0.409,0.347,-8.159,0,0.0359,0.834,5e-06,...,0,0,0,0,0,0,0,0,0,0
1,Letting Go,Angie McMahon,spotify:track:2XHznZZIWLkh7xO3WQAjpp,0.485,0.753,-6.652,1,0.0548,0.443,0.00912,...,0,0,0,0,0,0,0,0,0,0
2,Evergreen,Richy Mitch & The Coal Miners,spotify:track:6me7F0aaZjwDo6RJ5MrfBD,0.555,0.216,-11.661,1,0.0721,0.557,0.00416,...,0,0,0,0,0,0,0,0,0,1
3,The Bird Song,Noah Floersch,spotify:track:5DEpOV9no5cf22c5Lj198g,0.453,0.546,-5.477,1,0.0469,0.767,0.0,...,0,0,0,0,0,0,0,0,0,0
4,You’re Gonna Go Far,Noah Kahan,spotify:track:4nHJcUtNSUVjXRnjdP29Bk,0.586,0.358,-9.761,1,0.0308,0.556,0.0,...,0,0,0,0,0,0,0,0,0,0
5,Work Song,Hozier,spotify:track:5TgEJ62DOzBpGxZ7WRsrqb,0.531,0.363,-7.672,1,0.0608,0.749,0.0,...,0,1,0,0,0,0,0,0,0,0
6,Alaska,Maggie Rogers,spotify:track:4HfLQJtVT1KiX1eVedDyTm,0.847,0.386,-10.668,0,0.0507,0.411,0.00176,...,0,0,0,0,0,0,0,0,0,0
7,Anchor,Novo Amor,spotify:track:7qH9Z4dJEN0l9bidizW7fq,0.457,0.407,-11.475,1,0.0308,0.805,0.884,...,0,0,0,0,0,0,0,0,0,0
8,Beige,Yoke Lore,spotify:track:7uYQELhe7g6QLIzZDOlhbW,0.434,0.677,-7.535,1,0.119,0.384,0.0684,...,0,0,0,0,0,0,0,1,0,0
9,Maine,"hey, nothing",spotify:track:3pf3hHjBJP6E4zAaSDrl8r,0.52,0.508,-5.501,1,0.0369,0.0234,2e-05,...,0,0,0,0,0,0,0,0,1,0


## Modeling: Cosine Similarity Based Recommendation System

In [17]:
# sort the dataframes in alphabetical order so columns correspond to each other for the cosine similarity algorithm
playlist = playlist.sort_index(axis=1)
feat_vec = df.sort_index(axis=1)

# for cosine similarity, drop track_id column of the dataframe, this is not needed and numerical values are only needed
feat_vec_cosine_sim = feat_vec.drop('track_id', axis=1)

# drop the Artist, Title, and uri in the playlist dataframe as well since they are not numerical values  
columns_dropped = ['Artist', 'Title', 'uri']
playlist_cosine_sim = playlist.drop(columns_dropped, axis=1)

# Calculate column averages of the playlist dataframe
column_averages = playlist_cosine_sim.mean()

# Create a new DataFrame for the averages and totals
averages_cosine_sim = pd.DataFrame([column_averages], index=['Average'])

averages_cosine_sim

Unnamed: 0,acousticness,danceability,energy,genre_No Genre Found,genre_acoustic pop,genre_ambient folk,genre_anti-folk,genre_australian indie,genre_british singer-songwriter,genre_canadian folk,...,genre_stomp and holler,genre_swedish americana,genre_swedish country,instrumentalness,liveness,loudness,mode,speechiness,tempo,valence
Average,0.357257,0.54992,0.54844,0.14,0.06,0.02,0.02,0.02,0.04,0.02,...,0.32,0.02,0.02,0.055832,0.136246,-8.20688,0.88,0.042618,118.72084,0.42308


In [27]:
# Step 1: Find common numeric columns between feat_vec and averages_cosine_sim
common_columns = feat_vec.columns.intersection(averages_cosine_sim.columns)

# Step 2: Ensure the columns are numeric (filter out non-numeric columns like 'track_name', etc.)
numeric_columns = feat_vec[common_columns].select_dtypes(include=[float, int]).columns

# Step 3: Select the common numeric columns from both DataFrames for cosine similarity
feat_vec_numeric = feat_vec[numeric_columns]
averages_cosine_sim_numeric = averages_cosine_sim[numeric_columns]

# Step 4: Calculate cosine similarity using the aligned numeric columns
similarity_scores = cosine_similarity(feat_vec_numeric, averages_cosine_sim_numeric)

# Step 5: Assign similarity scores back to the original DataFrame
feat_vec['similarity_score'] = similarity_scores

# Step 6: Sort DataFrame by similarity score (highest to lowest)
top_similarities = feat_vec.sort_values(by='similarity_score', ascending=False)

# Step 7: Remove rows where track IDs match with playlist's track IDs to avoid duplicate recommendations
top_similarities = top_similarities[~top_similarities['track_id'].isin(playlist['uri'])]

# Calculate top 3 genres based on the sum of each genre column
genre_columns = [col for col in feat_vec.columns if col.startswith('genre_')]
top_3_genres = feat_vec[genre_columns].sum().sort_values(ascending=False).index[:3].str.replace('genre_', '')

print("Top 3 genres:", top_3_genres)

# Step 8: Get song recommendations from top 3 genres
first_genre = top_similarities.loc[top_similarities['genre_' + top_3_genres[0]] == 1].head(45)
second_genre = top_similarities.loc[top_similarities['genre_' + top_3_genres[1]] == 1].head(30)
third_genre = top_similarities.loc[top_similarities['genre_' + top_3_genres[2]] == 1].head(15)

# Step 9: Concatenate the top recommendations from the top 3 genres
top_similarities = pd.concat([first_genre, second_genre, third_genre], ignore_index=True)

# Final output of top recommendations
top_similarities


Top 3 genres: Index(['black-metal', 'gospel', 'ambient'], dtype='object')


Unnamed: 0,acousticness,artist_name,danceability,energy,genre_acoustic,genre_afrobeat,genre_alt-rock,genre_ambient,genre_black-metal,genre_blues,...,tempo,track_id,track_name,valence,year_2000-2004,year_2005-2009,year_2010-2014,year_2015-2019,year_2020-2023,similarity_score
0,0.2130,Thy Catafalque,0.0593,0.00755,False,False,False,False,True,False,...,0.823428,2A3NFyIp3FkSjEcXuVfBJp,Fehérvasárnap,0.0522,0.0,0.0,0.0,1.0,0.0,0.820421
1,0.2480,Oak Pantheon,0.2460,0.24000,False,False,False,False,True,False,...,0.831660,7uPzXjScTTfh9itLUDjgw0,A Prayer for Light,0.0528,0.0,0.0,0.0,1.0,0.0,0.687773
2,0.1420,Abgott,0.1600,0.17500,False,False,False,False,True,False,...,0.808808,7F2iSF9AMWbghugjtPJEki,Book 7: Thy Evocation Cthulhu (Parts I-IV),0.0368,1.0,0.0,0.0,0.0,0.0,0.620213
3,0.4920,Cough,0.2630,0.31900,False,False,False,False,True,False,...,0.812260,5ECnbsi09ARREGPnpErQIi,Still They Pray,0.1660,0.0,0.0,0.0,1.0,0.0,0.614556
4,0.2220,Lord Agheros,0.1220,0.32100,False,False,False,False,True,False,...,0.701304,1UxEnCypCF6BnSlofRWdIb,The Last Forsaken,0.0385,0.0,0.0,1.0,0.0,0.0,0.598482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.3690,Boards of Canada,0.1730,0.13200,False,False,False,True,False,False,...,0.803484,0fTRKXymeRaQW2tfg0MZNz,Diving Station,0.0653,1.0,0.0,0.0,0.0,0.0,0.553772
86,0.2760,Hexentanz,0.0788,0.28500,False,False,False,True,False,False,...,0.702592,2yJNCawVvAPkB8Cpo6svS0,Charivari,0.0382,0.0,1.0,0.0,0.0,0.0,0.544638
87,0.3590,Mokadelic,0.1320,0.50300,False,False,False,True,False,False,...,0.880284,5bnmND3EJjBGXs3QECuxMy,Tragic Vodka,0.0391,0.0,0.0,0.0,0.0,0.0,0.542572
88,0.7650,Suso Saiz,0.1420,0.12000,False,False,False,True,False,False,...,0.887240,0dR9uUA5NTCguMiNUiNXQS,Smoke,0.0563,0.0,0.0,0.0,0.0,1.0,0.537371


In [30]:
# Inspect actual column names first:
print(top_similarities.columns)

# Example: adjust based on actual column names
display_features = ['track_name', 'artist_name', 'similarity_score']  # Adjust 'genre' and 'preview_url' based on actual names

# Filter the DataFrame to show only these columns
playlist_recs = top_similarities[display_features]

# Convert similarity_score to percentage and round to 2 decimal places
playlist_recs['similarity_score'] = (playlist_recs['similarity_score'] * 100).round(2)

# Display the recommendations
playlist_recs


Index(['acousticness', 'artist_name', 'danceability', 'energy',
       'genre_acoustic', 'genre_afrobeat', 'genre_alt-rock', 'genre_ambient',
       'genre_black-metal', 'genre_blues',
       ...
       'tempo', 'track_id', 'track_name', 'valence', 'year_2000-2004',
       'year_2005-2009', 'year_2010-2014', 'year_2015-2019', 'year_2020-2023',
       'similarity_score'],
      dtype='object', length=102)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playlist_recs['similarity_score'] = (playlist_recs['similarity_score'] * 100).round(2)


Unnamed: 0,track_name,artist_name,similarity_score
0,Fehérvasárnap,Thy Catafalque,82.04
1,A Prayer for Light,Oak Pantheon,68.78
2,Book 7: Thy Evocation Cthulhu (Parts I-IV),Abgott,62.02
3,Still They Pray,Cough,61.46
4,The Last Forsaken,Lord Agheros,59.85
...,...,...,...
85,Diving Station,Boards of Canada,55.38
86,Charivari,Hexentanz,54.46
87,Tragic Vodka,Mokadelic,54.26
88,Smoke,Suso Saiz,53.74
