In [1]:
# Import Dependencies
import os
import numpy as np
import pandas as pd
import utils as utils
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): with no category/module argument this filter hides all
# warnings, including deprecation notices; consider narrowing it.
warnings.filterwarnings("ignore")

### Preprocessing

In [2]:
# Load the raw track export from disk into a DataFrame
df_raw = pd.read_csv(filepath_or_buffer="data/top_10000_1960-now.csv")

In [3]:
# Peek at the first five raw rows
df_raw.head(n=5)

Unnamed: 0,Track URI,Track Name,Artist URI(s),Artist Name(s),Album URI,Album Name,Album Artist URI(s),Album Artist Name(s),Album Release Date,Album Image URL,...,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Album Genres,Label,Copyrights
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,Justified & Ancient - Stand by the Jams,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,spotify:album:4MC0ZjNtVP1nDD5lsLxFjc,Songs Collection,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,1992-08-03,https://i.scdn.co/image/ab67616d0000b27355346b...,...,0.048,0.0158,0.112,0.408,0.504,111.458,4.0,,Jams Communications,"C 1992 Copyright Control, P 1992 Jams Communic..."
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,I Know You Want Me (Calle Ocho),spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,spotify:album:5xLAcbvbSAlRtPXnKkggXA,Pitbull Starring In Rebelution,spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,2009-10-23,https://i.scdn.co/image/ab67616d0000b27326d73a...,...,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0,,Mr.305/Polo Grounds Music/J Records,"P (P) 2009 RCA/JIVE Label Group, a unit of Son..."
2,spotify:track:70XtWbcVZcpaOddJftMcVi,From the Bottom of My Broken Heart,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,spotify:album:3WNxdumkSMGMJRhEgK80qx,...Baby One More Time (Digital Deluxe Version),spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,1999-01-12,https://i.scdn.co/image/ab67616d0000b2738e4986...,...,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0,,Jive,P (P) 1999 Zomba Recording LLC
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,Apeman - 2014 Remastered Version,spotify:artist:1SQRv42e4PjEYfPhS0Tk9E,The Kinks,spotify:album:6lL6HugNEN4Vlc8sj0Zcse,"Lola vs. Powerman and the Moneygoround, Pt. On...",spotify:artist:1SQRv42e4PjEYfPhS0Tk9E,The Kinks,2014-10-20,https://i.scdn.co/image/ab67616d0000b2731e7c53...,...,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0,,Sanctuary Records,"C © 2014 Sanctuary Records Group Ltd., a BMG C..."
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,You Can't Always Get What You Want,spotify:artist:22bE4uQ6baNwSHPVcDxLCe,The Rolling Stones,spotify:album:0c78nsgqX6VfniSNWIxwoD,Let It Bleed,spotify:artist:22bE4uQ6baNwSHPVcDxLCe,The Rolling Stones,1969-12-05,https://i.scdn.co/image/ab67616d0000b27373d927...,...,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0,,Universal Music Group,"C © 2002 ABKCO Music & Records Inc., P ℗ 2002 ..."


In [4]:
# Show every column label in the raw frame, to decide which ones to keep
# as features
df_raw.columns

Index(['Track URI', 'Track Name', 'Artist URI(s)', 'Artist Name(s)',
       'Album URI', 'Album Name', 'Album Artist URI(s)',
       'Album Artist Name(s)', 'Album Release Date', 'Album Image URL',
       'Disc Number', 'Track Number', 'Track Duration (ms)',
       'Track Preview URL', 'Explicit', 'Popularity', 'ISRC', 'Added By',
       'Added At', 'Artist Genres', 'Danceability', 'Energy', 'Key',
       'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Time Signature', 'Album Genres',
       'Label', 'Copyrights'],
      dtype='object')

In [5]:
# Trim the raw frame down to the track title plus the audio features.
# Columns that are kept (i.e. NOT listed below):
#   'Track Name', 'Danceability', 'Energy', 'Key', 'Speechiness',
#   'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo'
columns_to_drop = [
    # identifiers and artist/album metadata
    'Track URI', 'Artist URI(s)', 'Artist Name(s)',
    'Album URI', 'Album Name', 'Album Artist URI(s)', 'Album Artist Name(s)',
    'Album Release Date', 'Album Image URL',
    # track bookkeeping
    'Disc Number', 'Track Number', 'Track Duration (ms)', 'Track Preview URL',
    'Explicit', 'Popularity', 'ISRC', 'Added By', 'Added At', 'Artist Genres',
    # audio attributes not used as features
    'Loudness', 'Mode', 'Time Signature',
    # genre / label / rights information
    'Album Genres', 'Label', 'Copyrights',
]
df_data = df_raw.drop(columns=columns_to_drop)

In [6]:
# Map the source headers onto short lowercase names.
# Labels that are not present in df_data are ignored by rename().
# (An 'Explicit' -> 'explicit' entry is intentionally left out.)
column_aliases = {
    'Track Name': 'song',
    'Album Image URL': 'image',
    'Popularity': 'popularity',
    'Danceability': 'danceability',
    'Energy': 'energy',
    'Key': 'key',
    'Speechiness': 'speechiness',
    'Acousticness': 'acousticness',
    'Instrumentalness': 'instrumentalness',
    'Liveness': 'liveness',
    'Valence': 'valence',
    'Tempo': 'tempo',
}
df_data = df_data.rename(columns=column_aliases)

In [7]:
# De-duplicate on the full row: two rows count as duplicates only when
# every remaining column matches. The first occurrence is kept.
df_data = df_data.drop_duplicates()

In [8]:
# First five rows after trimming, renaming and de-duplicating
df_data.head(n=5)

Unnamed: 0,song,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Justified & Ancient - Stand by the Jams,0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458
1,I Know You Want Me (Calle Ocho),0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045
2,From the Bottom of My Broken Heart,0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981
3,Apeman - 2014 Remastered Version,0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311
4,You Can't Always Get What You Want,0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818


In [9]:
# Confirm the renamed columns are present and inspect each column's
# dtype before analysis
df_data.dtypes

song                 object
danceability        float64
energy              float64
key                 float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
dtype: object

In [10]:
# Discard rows with any missing value, then renumber the rows from 0
# (the old index is thrown away rather than kept as a column)
df_data = df_data.dropna().reset_index(drop=True)


In [11]:
# Disabled: 'Explicit' is removed by the column drop earlier in this
# notebook, so df_data has no 'explicit' column to convert.
# Kept for reference only.
# # Convert 'explicit' column from boolean to binary
# df_data['explicit'] = df_data['explicit'].astype(int)
# # verify update
# df_data.head()

In [12]:
# First five rows after null removal and index reset
df_data.head(n=5)

Unnamed: 0,song,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Justified & Ancient - Stand by the Jams,0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458
1,I Know You Want Me (Calle Ocho),0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045
2,From the Bottom of My Broken Heart,0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981
3,Apeman - 2014 Remastered Version,0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311
4,You Can't Always Get What You Want,0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818


In [16]:
# Preview the frame with the row index materialised as an 'index' column.
# NOTE(review): the result is only displayed, not assigned, so df_data
# itself is left unchanged by this cell.
df_data.reset_index()


Unnamed: 0,index,song,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0,Justified & Ancient - Stand by the Jams,0.617,0.872,8.0,0.0480,0.015800,0.112000,0.4080,0.504,111.458
1,1,I Know You Want Me (Calle Ocho),0.825,0.743,2.0,0.1490,0.014200,0.000021,0.2370,0.800,127.045
2,2,From the Bottom of My Broken Heart,0.677,0.665,7.0,0.0305,0.560000,0.000001,0.3380,0.706,74.981
3,3,Apeman - 2014 Remastered Version,0.683,0.728,9.0,0.2590,0.568000,0.000051,0.0384,0.833,75.311
4,4,You Can't Always Get What You Want,0.319,0.627,0.0,0.0687,0.675000,0.000073,0.2890,0.497,85.818
...,...,...,...,...,...,...,...,...,...,...,...
9706,9706,Kernkraft 400 (A Better Day),0.623,0.727,11.0,0.0562,0.184000,0.000020,0.3090,0.400,125.975
9707,9707,Never Say Never - Radio Edit,0.720,0.841,9.0,0.0340,0.000354,0.011200,0.3380,0.767,130.978
9708,9708,Groovejet (If This Ain't Love) [feat. Sophie E...,0.719,0.806,9.0,0.0389,0.000132,0.088900,0.3610,0.626,123.037
9709,9709,Lay Low,0.534,0.855,1.0,0.1830,0.060700,0.000263,0.3460,0.420,122.060


In [18]:
# Give every row a `song_id`.
# If an 'index' column exists (from an assigned reset_index()), rename it.
# Otherwise the rename is a silent no-op, so fall back to naming the row
# index `song_id`. That labels each row without adding a numeric column
# that would leak into the feature matrix. The cell is safe to re-run.
df_data = df_data.rename(columns={'index': 'song_id'})
if 'song_id' not in df_data.columns:
    df_data = df_data.rename_axis(index='song_id')
df_data.head()

Unnamed: 0,song,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Justified & Ancient - Stand by the Jams,0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458
1,I Know You Want Me (Calle Ocho),0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045
2,From the Bottom of My Broken Heart,0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981
3,Apeman - 2014 Remastered Version,0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311
4,You Can't Always Get What You Want,0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818


KNN Testing

In [13]:
# Starting point for the KNN experiments: first five prepared rows
df_data.head(n=5)

Unnamed: 0,song,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Justified & Ancient - Stand by the Jams,0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458
1,I Know You Want Me (Calle Ocho),0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045
2,From the Bottom of My Broken Heart,0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981
3,Apeman - 2014 Remastered Version,0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311
4,You Can't Always Get What You Want,0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818


In [None]:
# Feature matrix: every column of df_data except the song title
X = df_data.drop('song', axis='columns')

KeyError: "['song_id'] not found in axis"

In [None]:
# Per-row labels carried alongside the features through the split.
# df_data has no 'track_uri' column (the track URI is dropped during
# preprocessing), so indexing it raised a KeyError. Use the song title
# instead, which is the only identifier retained in df_data.
# NOTE(review): swap in a real target column if one is added later.
y = df_data['song']

In [None]:
# Standardise the feature columns: learn the scaling parameters from X,
# then apply them to X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Re-fit a fresh scaler on X and recompute X_scaled.
# NOTE(review): this repeats the scaling cell that precedes the split;
# X_train / X_test are not recomputed here.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

Gaussian Model

### What other models to test?
- Gaussian?
- DBSCAN?
- Agglomerative clustering?
