In [1]:
import pandas as pd
from IPython.display import display, Markdown

# Read the raw Kaggle CSV and show a quick summary: size, column names, first rows.
# NOTE(review): the column listing includes `Unnamed: 0`, which looks like a
# leftover index column written into the CSV — confirm whether it should be
# dropped (e.g. via index_col=0) before anything downstream relies on it.
df = pd.read_csv("../data/raw_data/-spotify-tracks-dataset/dataset.csv")
display(Markdown("**Loading Spotify Tracks Dataset from Kaggle:**\n"))
print("Rows count: {}  |  Columns count: {}".format(*df.shape))
display(Markdown("### Overview of dataset"))

# Render every column name as inline code, separated by single spaces.
df_cols = df.columns
df_cols_str = " ".join(map("`{}`".format, df_cols))
display(Markdown("**Columns:**<br>" + df_cols_str))

display(Markdown("**First 5 rows of dataset:**"))
display(df.head())

**Loading Spotify Tracks Dataset from Kaggle:**


Rows count: 114000  |  Columns count: 21


### Overview of dataset

**Columns:**<br>`Unnamed: 0` `track_id` `artists` `album_name` `track_name` `popularity` `duration_ms` `explicit` `danceability` `energy` `key` `loudness` `mode` `speechiness` `acousticness` `instrumentalness` `liveness` `valence` `tempo` `time_signature` `track_genre`

**First 5 rows of dataset:**

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [6]:
display(Markdown("### Forming main dataset X with important informations for Clustering:\n"))

# Audio descriptors used as the primary clustering input.
main_features = [
    'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness',
    'valence', 'tempo', 'duration_ms',
]
display(Markdown("***Features- mode, key and liveness are left out because they could add too much noise***"))

# Keep only the selected columns, then discard any row with a missing value in them.
X = df.loc[:, main_features].dropna()

# Render the selected column names as inline code, separated by single spaces.
X_cols = X.columns
X_cols_str = " ".join(map("`{}`".format, X_cols))
display(Markdown("**Columns:**<br>" + X_cols_str))
display(Markdown("**First 5 rows of dataset:**"))
display(X.head())

### Forming main dataset X with important informations for Clustering:


***Features- mode, key and liveness are left out because they could add too much noise***

**Columns:**<br>`danceability` `energy` `loudness` `speechiness` `acousticness` `instrumentalness` `valence` `tempo` `duration_ms`

**First 5 rows of dataset:**

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,valence,tempo,duration_ms
0,0.676,0.461,-6.746,0.143,0.0322,1e-06,0.715,87.917,230666
1,0.42,0.166,-17.235,0.0763,0.924,6e-06,0.267,77.489,149610
2,0.438,0.359,-9.734,0.0557,0.21,0.0,0.12,76.332,210826
3,0.266,0.0596,-18.515,0.0363,0.905,7.1e-05,0.143,181.74,201933
4,0.618,0.443,-9.681,0.0526,0.469,0.0,0.167,119.949,198853


In [3]:
display(Markdown("### Forming extended dataset Y with additional informations - mode, key and liveness:\n"))

# Extended set = the main clustering features plus the three columns left out of X.
extended_features = [*main_features, 'mode', 'key', 'liveness']

# Keep only the extended columns, then discard any row with a missing value in them.
Y = df.loc[:, extended_features].dropna()

# Render the selected column names as inline code, separated by single spaces.
Y_cols = Y.columns
Y_cols_str = " ".join(map("`{}`".format, Y_cols))
display(Markdown("**Columns:**<br>" + Y_cols_str))

display(Markdown("**First 5 rows of dataset:**"))
display(Y.head())

### Forming extended dataset Y with additional informations - mode, key and liveness:


**Columns:**<br>`danceability` `energy` `loudness` `speechiness` `acousticness` `instrumentalness` `valence` `tempo` `duration_ms` `mode` `key` `liveness`

**First 5 rows of dataset:**

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,valence,tempo,duration_ms,mode,key,liveness
0,0.676,0.461,-6.746,0.143,0.0322,1e-06,0.715,87.917,230666,0,1,0.358
1,0.42,0.166,-17.235,0.0763,0.924,6e-06,0.267,77.489,149610,1,1,0.101
2,0.438,0.359,-9.734,0.0557,0.21,0.0,0.12,76.332,210826,1,0,0.117
3,0.266,0.0596,-18.515,0.0363,0.905,7.1e-05,0.143,181.74,201933,1,0,0.132
4,0.618,0.443,-9.681,0.0526,0.469,0.0,0.167,119.949,198853,1,2,0.0829


In [4]:
from sklearn.preprocessing import StandardScaler

display(Markdown("### Scaling datasets to use for K-means Clustering\n"))

# Each feature set gets its own fitted scaler, kept under its own name so the
# fitted statistics stay available after this cell.
scaler_main = StandardScaler().fit(X)
X_scaled = scaler_main.transform(X)

scaler_ext = StandardScaler().fit(Y)
Y_scaled = scaler_ext.transform(Y)

display(Markdown("**Data normalized with StandardScaler:**"))
print("X_scaled shape:", X_scaled.shape)
print("Y_scaled shape:", Y_scaled.shape)

### Scaling datasets to use for K-means Clustering


**Data normalized with StandardScaler:**

X_scaled shape: (114000, 9)
Y_scaled shape: (114000, 12)


In [5]:
import numpy as np
import os

# Persist the cleaned table and both scaled matrices to ../data/processed_data.

# Drop every row that is missing any of the extended clustering features.
# The feature list is copied as-is: a set() round-trip adds nothing for
# dropna(subset=...) and only makes the order of clean_features non-deterministic.
clean_features = list(extended_features)
df_clean = df.dropna(subset=clean_features).reset_index(drop=True)
print(f"Original rows: {len(df)}  |  Cleaned rows: {len(df_clean)}")

# Guard against silent row misalignment. X was filtered on main_features only,
# while Y and df_clean are filtered on the extended list; a NaN that occurs only
# in mode/key/liveness would leave X_scaled with more rows than df_clean, so
# row i of the saved array would no longer describe row i of the saved table.
row_counts = {
    "df_clean": len(df_clean),
    "X_scaled": X_scaled.shape[0],
    "Y_scaled": Y_scaled.shape[0],
}
if len(set(row_counts.values())) != 1:
    raise ValueError(
        f"Row counts differ, refusing to save misaligned datasets: {row_counts}"
    )

os.makedirs("../data/processed_data", exist_ok=True)

# index=False: the index was just reset, so it carries no information worth saving.
df_clean.to_csv("../data/processed_data/spotify_clean.csv", index=False)
np.save("../data/processed_data/X_scaled.npy", X_scaled)
np.save("../data/processed_data/Y_scaled.npy", Y_scaled)

print("Datasets saved.")


Original rows: 114000  |  Cleaned rows: 114000
