In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [4]:
spotify_df = pd.read_csv('spotify-2023.csv', encoding='latin-1')

In [5]:
spotify_df.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [6]:
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

In [12]:
# remove records with null values
spotify_df.dropna(inplace=True)

In [10]:
spotify_df.columns

Index(['track_name', 'artist(s)_name', 'artist_count', 'released_year',
       'released_month', 'released_day', 'in_spotify_playlists',
       'in_spotify_charts', 'streams', 'in_apple_playlists', 'in_apple_charts',
       'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm',
       'key', 'mode', 'danceability_%', 'valence_%', 'energy_%',
       'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%'],
      dtype='object')

In [14]:
# define feature set for knn
X = spotify_df[['bpm',
       'key', 'mode', 'danceability_%', 'valence_%', 'energy_%',
       'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']]

In [15]:
# label encode key and mode
le = LabelEncoder()
X['key'] = le.fit_transform(X['key'])
X['mode'] = le.fit_transform(X['mode'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['key'] = le.fit_transform(X['key'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['mode'] = le.fit_transform(X['mode'])


In [27]:
# apply Standard scaler to features
scaler = MinMaxScaler().fit(X)
X_scaled_array = scaler.transform(X)

In [28]:
X_scaled_df = pd.DataFrame(X_scaled_array, columns=X.columns)

In [29]:
X_scaled_df.head()

Unnamed: 0,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,0.425532,0.2,0.0,0.780822,0.913978,0.831325,0.319588,0.0,0.053191,0.032258
1,0.191489,0.3,0.0,0.657534,0.612903,0.722892,0.072165,0.0,0.074468,0.032258
2,0.51773,0.7,0.0,0.383562,0.301075,0.46988,0.175258,0.0,0.297872,0.064516
3,0.744681,0.0,0.0,0.438356,0.580645,0.698795,0.113402,0.0,0.085106,0.209677
4,0.560284,0.0,1.0,0.575342,0.204301,0.795181,0.14433,0.692308,0.085106,0.064516


In [31]:
# train knn with kd_tree algorithm
knn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree').fit(X_scaled_df)

In [36]:
import pickle

In [37]:
# serialize knn model
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(knn, f)