# Dependencies

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re 

import spotipy
from spotipy.oauth2 import SpotifyOAuth
from spotipy.oauth2 import SpotifyClientCredentials

from dotenv import load_dotenv
import os

### Spotify Setup

In [2]:
# load environment variables 
load_dotenv()

# get Spotify API login credentials 
cid = os.getenv("cid")
secret = os.getenv("secret")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id= cid,
                                                           client_secret= secret))

# Inputs

In [3]:
# take user input from Spotify 
input_url = input('Input Spotify song URL: ')
# find audio features of user's track
pp_input_df = sp.audio_features(input_url)
p_input_df = pd.DataFrame.from_dict(pp_input_df)
input_df = p_input_df[['id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo', 'time_signature']]

# TEST CODE 
# input_df

Input Spotify song URL: https://open.spotify.com/track/560Ac5wm8qT64yGNRKRhpN?si=c714302904844893


# Data Frame Preparation   

##### Spotify song features dataset link: 
https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

In [4]:
# read in Spotify data set and return cleaned DataFrame 
df2 = pd.read_csv('tracks_features.csv')
df3 = df2[['id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo', 'time_signature']]
df4 = pd.concat([input_df, df3]).reset_index(drop=True)

# TEST CODE 
# df2 = df1.truncate(after = 200)
# df4.head()

### Create a smaller dataframe to test genre functions 
#### A full-scale implementation is currently being worked on 

In [5]:
df4a = df4.truncate(after = 200)

# TEST CODE
df4a.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,valence,tempo,time_signature
0,560Ac5wm8qT64yGNRKRhpN,0.521,0.55,1,-6.222,1,0.0709,0.0029,0.171,0.0391,184.018,4.0
1,7lmeHLHBe4nmXzuXc0HDjk,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.503,117.906,4.0
2,1wsRitfRRtWyEapl0q22o8,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.489,103.68,4.0
3,1hR0fIFK2qRG3f3RF70pb7,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.37,149.749,4.0
4,2lbASgTSoDO7MTuLAXlTW0,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.574,96.752,4.0


In [6]:
# add column to populate 
df4a["artist_genre"] = df4a.apply(lambda _: ' ', axis=1)
# appending list in dataframe; need to set type as 'obeject'
df4a['artist_genre'] = df4a['artist_genre'].astype('object')


In [7]:
# fetch genres using song ids and populate dataframe with genres
# this function is very slow. it will be improved when adapting this program to the full dataset 

for i in range(len(df4a.index)): 
    trackid1 = df4a.loc[i, 'id']
    
    track_info = sp.track(trackid1)
    artist_id = track_info.get('artists')[0]['id']
    genre_list = sp.artist(artist_id)['genres']
    
    # populate column with list  
    df4a.at[i, 'artist_genre'] = genre_list    
    
# fills null values 
df4a['artist_genre'] = df4a['artist_genre'].apply(lambda d: d if isinstance(d, list) else [])

# TEST CODE
df4a.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,valence,tempo,time_signature,artist_genre
0,560Ac5wm8qT64yGNRKRhpN,0.521,0.55,1,-6.222,1,0.0709,0.0029,0.171,0.0391,184.018,4.0,"[drum and bass, edm, electro house, melodic du..."
1,7lmeHLHBe4nmXzuXc0HDjk,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.503,117.906,4.0,"[alternative metal, alternative rock, consciou..."
2,1wsRitfRRtWyEapl0q22o8,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.489,103.68,4.0,"[alternative metal, alternative rock, consciou..."
3,1hR0fIFK2qRG3f3RF70pb7,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.37,149.749,4.0,"[alternative metal, alternative rock, consciou..."
4,2lbASgTSoDO7MTuLAXlTW0,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.574,96.752,4.0,"[alternative metal, alternative rock, consciou..."


#### The genre column is a list of genres, to access this list and compare genre values we need each genre to be a separate column

In [8]:
# create genre_upd column
df4a["genre_upd"] = df4a.apply(lambda _: ' ', axis=1)

# populate column with space removed genres
for i in range(len(df4a.index)): 
    strip_genre = []
    for j in range(len(df4a.loc[i]['artist_genre'])):
        list_ele = df4a.loc[i]['artist_genre'][j]
        fixed_ele = list_ele.replace(" ", "_")
        strip_genre.append(fixed_ele)
    df4a.at[i, 'genre_upd'] = strip_genre
    
# TEST CODE
#ITSASTRING=(df4a.loc[1]['artist_genre'][0])
#string = ITSASTRING.replace(" ", "_")
#print(df4a.loc[1]['artist_genre'][0])
#print(string)
df4a.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,valence,tempo,time_signature,artist_genre,genre_upd
0,560Ac5wm8qT64yGNRKRhpN,0.521,0.55,1,-6.222,1,0.0709,0.0029,0.171,0.0391,184.018,4.0,"[drum and bass, edm, electro house, melodic du...","[drum_and_bass, edm, electro_house, melodic_du..."
1,7lmeHLHBe4nmXzuXc0HDjk,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.503,117.906,4.0,"[alternative metal, alternative rock, consciou...","[alternative_metal, alternative_rock, consciou..."
2,1wsRitfRRtWyEapl0q22o8,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.489,103.68,4.0,"[alternative metal, alternative rock, consciou...","[alternative_metal, alternative_rock, consciou..."
3,1hR0fIFK2qRG3f3RF70pb7,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.37,149.749,4.0,"[alternative metal, alternative rock, consciou...","[alternative_metal, alternative_rock, consciou..."
4,2lbASgTSoDO7MTuLAXlTW0,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.574,96.752,4.0,"[alternative metal, alternative rock, consciou...","[alternative_metal, alternative_rock, consciou..."


## Genre Dummy Variable encoding
#### We want to create dummy variables so that genre vectors are compared with the cosine similarity function 

In [11]:
# apply TFIDF vectorizer to genre lists 
tfidf = TfidfVectorizer()
tfidf_matrix =  tfidf.fit_transform(df4a['genre_upd'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]
genre_df.reset_index(drop = True, inplace=True)

# add ID column to new TFIDF genre dataframe
genre_df["id"] = genre_df.apply(lambda _: ' ', axis=1)

for i in range(len(df4a.index)): 
    ids = df4a.loc[i, 'id']
    genre_df.at[i, 'id'] = ids

# TEST CODE
# pd.set_option('display.max_columns', None)
# genre_df = genre_df.reindex(sorted(genre_df.columns), axis=1)
genre_df.head()

Unnamed: 0,genre|album_rock,genre|alternative_metal,genre|alternative_rock,genre|anti,genre|buffalo_ny_indie,genre|canadian_singer,genre|christian_metal,genre|christian_punk,genre|classic_canadian_rock,genre|classic_rock,...,genre|roots_rock,genre|s_music,genre|screamo,genre|singer,genre|skate_punk,genre|soft_rock,genre|songwriter,genre|women,genre|yacht_rock,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,560Ac5wm8qT64yGNRKRhpN
1,0.0,0.352315,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7lmeHLHBe4nmXzuXc0HDjk
2,0.0,0.352315,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1wsRitfRRtWyEapl0q22o8
3,0.0,0.352315,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1hR0fIFK2qRG3f3RF70pb7
4,0.0,0.352315,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2lbASgTSoDO7MTuLAXlTW0


## Feature Scaling
### MinMaxScaler scales audio features to the range of 0 and 1. 
#### - Scaling all audio features to the range of 0 and 1 ensure that each metric is weighted on the same scale when finding recommendations. 

In [15]:
# select columns to apply feature scaling to 
columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo', 'time_signature']

# scale selected columns
scaler = preprocessing.MinMaxScaler()
df4a[columns] = scaler.fit_transform(df4a[columns])


# TEST CODE 
df4a.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,valence,tempo,time_signature,artist_genre,genre_upd
0,560Ac5wm8qT64yGNRKRhpN,0.466667,0.527473,0.090909,0.87575,1.0,0.060128,0.002963,0.190423,0.0,0.884625,0.5,"[drum and bass, edm, electro house, melodic du...","[drum_and_bass, edm, electro_house, melodic_du..."
1,7lmeHLHBe4nmXzuXc0HDjk,0.387597,0.997802,0.636364,0.91375,1.0,0.062436,0.026685,1.2e-05,0.496203,0.450891,0.5,"[alternative metal, alternative rock, consciou...","[alternative_metal, alternative_rock, consciou..."
2,1wsRitfRRtWyEapl0q22o8,0.587597,0.974725,1.0,0.896897,1.0,0.210256,0.013188,7.9e-05,0.481228,0.357559,0.5,"[alternative metal, alternative rock, consciou...","[alternative_metal, alternative_rock, consciou..."
3,1hR0fIFK2qRG3f3RF70pb7,0.147287,0.989011,0.636364,0.912596,1.0,0.588462,0.023924,2e-06,0.353942,0.6598,0.5,"[alternative metal, alternative rock, consciou...","[alternative_metal, alternative_rock, consciou..."
4,2lbASgTSoDO7MTuLAXlTW0,0.341085,0.985714,1.0,0.89385,0.0,0.273077,0.166665,4e-06,0.572147,0.312108,0.5,"[alternative metal, alternative rock, consciou...","[alternative_metal, alternative_rock, consciou..."


## Calculating Cosine Similarity
### Cosine_similarity finds how similar the user input song is to each song in the DataFrame
#### - Cosine similarity calculates how similar the input track is to each track in the DataFrame by finding the difference between the input audio features vector and each DataFrame audio features vector using the formula: (A.B) / (||A||.||B||).

In [17]:
# combine genre and song attribute dataframes
df4b = df4a.merge(genre_df, how = 'left')
# drop columns used to genereate genre dataframe
df4b = df4b.drop(columns=['artist_genre', 'genre_upd'])

# TEST CODE
pd.set_option('display.max_columns', None)
df4b.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,valence,tempo,time_signature,genre|album_rock,genre|alternative_metal,genre|alternative_rock,genre|anti,genre|buffalo_ny_indie,genre|canadian_singer,genre|christian_metal,genre|christian_punk,genre|classic_canadian_rock,genre|classic_rock,genre|conscious_hip_hop,genre|country_rock,genre|dance_pop,genre|dark_pop,genre|deep_christian_rock,genre|drum_and_bass,genre|ectofolk,genre|edm,genre|electro_house,genre|emo,genre|europop,genre|folk,genre|folk_rock,genre|funk_metal,genre|kentucky_metal,genre|lilith,genre|melancholia,genre|mellow_gold,genre|melodic_dubstep,genre|new_wave_pop,genre|nu_metal,genre|permanent_wave,genre|piano_rock,genre|political_hip_hop,genre|pop_punk,genre|pop_rock,genre|punk,genre|rap_metal,genre|rap_rock,genre|rock,genre|roots_rock,genre|s_music,genre|screamo,genre|singer,genre|skate_punk,genre|soft_rock,genre|songwriter,genre|women,genre|yacht_rock
0,560Ac5wm8qT64yGNRKRhpN,0.466667,0.527473,0.090909,0.87575,1.0,0.060128,0.002963,0.190423,0.0,0.884625,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7lmeHLHBe4nmXzuXc0HDjk,0.387597,0.997802,0.636364,0.91375,1.0,0.062436,0.026685,1.2e-05,0.496203,0.450891,0.5,0.0,0.352315,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299523,0.0,0.0,0.0,0.0,0.0,0.0,0.352315,0.0,0.0,0.352315,0.0,0.0,0.0,0.352315,0.299523,0.275346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1wsRitfRRtWyEapl0q22o8,0.587597,0.974725,1.0,0.896897,1.0,0.210256,0.013188,7.9e-05,0.481228,0.357559,0.5,0.0,0.352315,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299523,0.0,0.0,0.0,0.0,0.0,0.0,0.352315,0.0,0.0,0.352315,0.0,0.0,0.0,0.352315,0.299523,0.275346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1hR0fIFK2qRG3f3RF70pb7,0.147287,0.989011,0.636364,0.912596,1.0,0.588462,0.023924,2e-06,0.353942,0.6598,0.5,0.0,0.352315,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299523,0.0,0.0,0.0,0.0,0.0,0.0,0.352315,0.0,0.0,0.352315,0.0,0.0,0.0,0.352315,0.299523,0.275346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2lbASgTSoDO7MTuLAXlTW0,0.341085,0.985714,1.0,0.89385,0.0,0.273077,0.166665,4e-06,0.572147,0.312108,0.5,0.0,0.352315,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299523,0.0,0.0,0.0,0.0,0.0,0.0,0.352315,0.0,0.0,0.352315,0.0,0.0,0.0,0.352315,0.299523,0.275346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# create and populate column in DataFrame to hold cosine similarity calculations
df4b['distances'] = cosine_similarity(df4b.iloc[0:,1:], df4b.iloc[0:1,1:])

# select number of songs to display in output 
n = 10
n_plus1 = n + 1
n_largest = df4b.nlargest((n_plus1), 'distances')


# TEST CODE
print(df4b['distances'])

0      1.000000
1      0.684385
2      0.648853
3      0.685118
4      0.465634
         ...   
196    0.707534
197    0.461794
198    0.633029
199    0.724538
200    0.557000
Name: distances, Length: 201, dtype: float64


# Outputs 

In [19]:
# create DataFrame to hold top n track audio features
final_df = []

for i in n_largest['id']:
    final_df.append(
    
        sp.audio_features(n_largest['id'])
    )


# create DataFrame to hold top n track IDs
final_df1 = []
    
for i in range(n_plus1):
    final_df1.append(
        final_df[0][i]['id']
    )


# TEST CODE
# final_df

In [20]:
# create empty DataFrame to hold output  
result = pd.DataFrame(columns = ['ID', 'Title', 'Artist', 'Year', 'URI'])

# populate output DataFrame
for i in range(n_plus1):
    r_id = final_df1[i] 
    track_detail = sp.track(r_id)
    
    r_name = track_detail.get('name')
    r_artist = track_detail.get('artists')[0]['name']
    r_uri = track_detail.get('uri')
    r_year = track_detail.get('album')['release_date']
    
    r_list = [r_id, r_name, r_artist, r_year, r_uri]
    result.loc[len(result)] = r_list

# display output DataFrame
display(result)

Unnamed: 0,ID,Title,Artist,Year,URI
0,560Ac5wm8qT64yGNRKRhpN,Dreams,NERO,2017-08-14,spotify:track:560Ac5wm8qT64yGNRKRhpN
1,2nVW5BVneWEVqhfOzyNxwo,Tired of Waiting,Death & Desire,2011-02-08,spotify:track:2nVW5BVneWEVqhfOzyNxwo
2,6zLpXc5KrSN53HTviwOfgL,Desperate,Death & Desire,2011-02-08,spotify:track:6zLpXc5KrSN53HTviwOfgL
3,0C5pv62j5dcazPrml3oGnE,All I See,Death & Desire,2011-02-08,spotify:track:0C5pv62j5dcazPrml3oGnE
4,4FKggErzionSD7MdrEJPf2,Give,Death & Desire,2011-02-08,spotify:track:4FKggErzionSD7MdrEJPf2
5,7tPDxnbL6pqGd8wesq9HLl,Pain,Death & Desire,2011-02-08,spotify:track:7tPDxnbL6pqGd8wesq9HLl
6,11cxKUEgnVAlesUKt4e3br,Bullet In the Head,Rage Against The Machine,1992-11-03,spotify:track:11cxKUEgnVAlesUKt4e3br
7,2LXPNLSMAauNJfnC58lSqY,Born of a Broken Man,Rage Against The Machine,1999-11-02,spotify:track:2LXPNLSMAauNJfnC58lSqY
8,7eTTBAIXL8BEw9Uj6iyzFn,Tell Me,Bride,1997-01-01,spotify:track:7eTTBAIXL8BEw9Uj6iyzFn
9,4RIyExqhnXEe3dHUNMGbwT,The Arrivals Gate,Ani DiFranco,1999-11-16,spotify:track:4RIyExqhnXEe3dHUNMGbwT


#### These are certainly not the best recommendations. This is because these recommendations are generated from a testing model containing a dataframe of 200 songs, a better model would be generated from the entire dataset containing 1.2M songs