In [125]:
#Import the necessary libraries
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
sns.set()
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans


In [126]:
#Laod the dataset
data = pd.read_csv('spotify_data.csv')
data.columns


Index(['acousticness', 'artists', 'danceability', 'duration_ms', 'energy',
       'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'name', 'popularity', 'release_date', 'speechiness', 'tempo',
       'valence', 'year'],
      dtype='object')

In [127]:
#Data Preprocessing
data = data[data['year']>=2000]

In [128]:
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
6054,0.972,['David Bauer'],0.567,313293,0.227,0,0w0D8H1ubRerCXHWYJkinO,0.601,10,0.11,-13.441,1,Shout to the Lord,47,2000,0.029,136.123,0.0396,2000
6055,0.321,['Etta James'],0.821,360240,0.418,0,4JVeqfE2tpi7Pv63LJZtPh,0.000372,9,0.222,-9.841,0,Miss You,51,12/12/2000,0.0407,117.382,0.803,2000
6056,0.00659,['Quasimoto'],0.706,202507,0.602,1,5pxtdhLAi0RTh1gNqhGMNA,0.000138,11,0.4,-8.306,0,Real Eyes,44,6/13/2000,0.342,89.692,0.479,2000
6057,0.0039,['Millencolin'],0.368,173360,0.977,0,3jRsoe4Vkxa4BMYqGHX8L0,0.0,11,0.35,-2.757,0,Penguins & Polarbears,52,2/22/2000,0.127,165.889,0.548,2000
6058,0.122,['Steve Chou'],0.501,344200,0.511,0,4mronxcllhfyhBRqyZi8kU,0.0,7,0.279,-9.836,0,黃昏,53,12/25/2000,0.0291,78.045,0.113,2000


In [129]:
df = data.drop(columns=['id','name','artists','release_date','year'], axis=1)

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41656 entries, 6054 to 169908
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      41656 non-null  float64
 1   danceability      41656 non-null  float64
 2   duration_ms       41656 non-null  int64  
 3   energy            41656 non-null  float64
 4   explicit          41656 non-null  int64  
 5   instrumentalness  41656 non-null  float64
 6   key               41656 non-null  int64  
 7   liveness          41656 non-null  float64
 8   loudness          41656 non-null  float64
 9   mode              41656 non-null  int64  
 10  popularity        41656 non-null  int64  
 11  speechiness       41656 non-null  float64
 12  tempo             41656 non-null  float64
 13  valence           41656 non-null  float64
dtypes: float64(9), int64(5)
memory usage: 4.8 MB


In [131]:
print("Check if there are any null values in the columns:")
print(df.isnull().sum())

Check if there are any null values in the columns:
acousticness        0
danceability        0
duration_ms         0
energy              0
explicit            0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
popularity          0
speechiness         0
tempo               0
valence             0
dtype: int64


In [132]:
df.corr()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
acousticness,1.0,-0.181267,-0.02272,-0.704208,-0.147271,0.287628,-0.018452,-0.088294,-0.593247,0.076514,-0.010563,-0.101049,-0.157583,-0.189472
danceability,-0.181267,1.0,-0.142586,0.104212,0.284077,-0.315073,0.020736,-0.10003,0.293498,-0.070747,0.144195,0.215453,-0.084708,0.493865
duration_ms,-0.02272,-0.142586,1.0,-0.008007,-0.042391,0.079613,-0.004897,0.035549,-0.049672,-0.023657,-0.108634,-0.029919,-0.02768,-0.158049
energy,-0.704208,0.104212,-0.008007,1.0,0.06484,-0.298454,0.035948,0.177244,0.74165,-0.06671,-0.055513,0.11321,0.204833,0.347885
explicit,-0.147271,0.284077,-0.042391,0.06484,1.0,-0.149461,0.000971,0.052678,0.11527,-0.116557,0.169427,0.488296,-0.011252,-0.023629
instrumentalness,0.287628,-0.315073,0.079613,-0.298454,-0.149461,1.0,-0.025166,-0.028174,-0.562442,-0.011803,-0.05232,-0.13176,-0.115032,-0.268455
key,-0.018452,0.020736,-0.004897,0.035948,0.000971,-0.025166,1.0,-0.00077,0.034144,-0.131582,-0.008489,0.014806,0.007479,0.047671
liveness,-0.088294,-0.10003,0.035549,0.177244,0.052678,-0.028174,-0.00077,1.0,0.071872,-0.011441,-0.052588,0.128764,0.00409,-0.00293
loudness,-0.593247,0.293498,-0.049672,0.74165,0.11527,-0.562442,0.034144,0.071872,1.0,-0.038965,0.028191,0.079785,0.202757,0.339367
mode,0.076514,-0.070747,-0.023657,-0.06671,-0.116557,-0.011803,-0.131582,-0.011441,-0.038965,1.0,-0.060753,-0.108349,0.017327,0.000113


In [133]:
#Normalize data by using MinMaxScaler
datatypes = ['int16', 'int32','int64','float16','float32','float64']
normalized = df.select_dtypes(include=datatypes)
for col in normalized.columns:
    MinMaxScaler(col)

In [145]:
#Create new feature to differentiate between songs from different categories
kmeans = KMeans(n_clusters=25)
features = kmeans.fit_predict(normalized)
df['features'] = features
MinMaxScaler(df['features'])

MinMaxScaler(feature_range=6054       6
6055      10
6056      24
6057       4
6058      10
          ..
169904    21
169905    21
169906     4
169907    21
169908    22
Name: features, Length: 41656, dtype: int32)

In [146]:
#Code the class function for recommending songs based on input
class Song_Recommendation():
    def __init__(self,dataset):
        self.dataset = dataset
    def recommend(self, songs, amount=1):
        distance = []
        song = self.dataset[(self.dataset.name.str.lower() == songs.lower())].head(1).values[0]
        rec = self.dataset[self.dataset.name.str.lower() != songs.lower()]
        for songs in tqdm(rec.values):
            d = 0
            for col in np.arange(len(rec.columns)):
                if not col in [1, 6, 12, 14, 18]:
                    d = d + np.absolute(float(song[col]) - float(songs[col]))
            distance.append(d)
        rec['distance'] = distance
        rec = rec.sort_values('distance')
        columns = ['artists', 'name']
        return rec[columns][:amount]



In [148]:
#Test
model = Song_Recommendation(data)
model.recommend("The Nights", 15)

100%|█████████████████████████████████████████████████████████████████████████| 41655/41655 [00:01<00:00, 31247.17it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec['distance'] = distance


Unnamed: 0,artists,name
116317,['Travis Scott'],A-Team
107207,"['NLE Choppa', 'Blueface']",Shotta Flow (Feat. Blueface) [Remix]
97644,"['YNW Melly', '9lokknine']",223's (feat. 9lokknine)
87732,"['GoldLink', 'Brent Faiyaz', 'Shy Glizzy']",Crew (feat. Brent Faiyaz & Shy Glizzy)
152850,['J Balvin'],Gris
107165,"['YNW Melly', '9lokknine']",223's (feat. 9lokknine)
159888,"[""Los Player's""]",Ya Para Qué (Para Qué)
116140,['Rich Homie Quan'],"Flex (Ooh, Ooh, Ooh)"
39982,['Shawn Mendes'],Use Somebody - Recorded at Spotify Studios NYC
63091,['Yungeen Ace'],Step Harder
