In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cdist
import numpy as np
import os

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler

import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'

mixed_playlist = pd.read_csv('data/mixed_playlist.csv')


In [2]:
# To display categories "Users" that exists in the column user
num_categories = mixed_playlist.user.unique()

In [3]:
mapping = {'unknown': 0, 'delta': 1, 'gamma': 2, 'alpha': 3, 'dzeta': 4, 'beta': 5, 'epsilon': 6}
mixed_playlist['user'] = mixed_playlist['user'].replace(mapping)
mixed_playlist.user

0       0
1       1
2       2
3       2
4       3
       ..
3894    5
3895    1
3896    4
3897    5
3898    0
Name: user, Length: 3899, dtype: int64

In [4]:
# Replace 'unknown' values in 'top_year' column with 0
mixed_playlist['top_year'] = mixed_playlist['top_year'].replace('unknown', 0)

mixed_playlist.top_year

0          0
1       2022
2       2020
3       2018
4       2020
        ... 
3894    2020
3895    2020
3896    2021
3897    2018
3898       0
Name: top_year, Length: 3899, dtype: object

In [5]:
mixed_playlist.info()
mixed_playlist.dropna(inplace=True)

# display the descriptive statistics of numeric variables
mixed_playlist.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3899 entries, 0 to 3898
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3890 non-null   object 
 1   album             3890 non-null   object 
 2   artist            3899 non-null   object 
 3   release_date      3899 non-null   object 
 4   length            3899 non-null   int64  
 5   popularity        3899 non-null   int64  
 6   acousticness      3899 non-null   float64
 7   danceability      3899 non-null   float64
 8   energy            3899 non-null   float64
 9   instrumentalness  3899 non-null   float64
 10  liveness          3899 non-null   float64
 11  loudness          3899 non-null   float64
 12  speechiness       3899 non-null   float64
 13  tempo             3899 non-null   float64
 14  valence           3899 non-null   float64
 15  time_signature    3899 non-null   int64  
 16  key               3899 non-null   int64  


Unnamed: 0,length,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,time_signature,key,mode,release_year,user
count,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0
mean,244418.7,32.005398,0.42512,0.554521,0.523066,0.296768,0.167134,-10.436968,0.085179,115.826347,0.384641,3.873779,5.29563,0.466838,2012.748072,3.357326
std,104574.7,24.920088,0.363021,0.192029,0.265409,0.386456,0.143656,6.209512,0.092698,28.662454,0.254691,0.465588,3.507038,0.498963,10.418784,1.844656
min,31053.0,0.0,1e-06,0.0,0.000545,0.0,0.0179,-42.117,0.0,0.0,0.0,0.0,0.0,0.0,1957.0,0.0
25%,188033.2,1.0,0.058025,0.42425,0.313,4e-06,0.0945,-12.79,0.0368,94.54675,0.161,4.0,2.0,0.0,2012.0,2.0
50%,224248.5,34.0,0.339,0.575,0.5475,0.00573,0.112,-8.589,0.047,115.1515,0.355,4.0,5.0,0.0,2016.0,3.0
75%,274490.0,51.0,0.793,0.701,0.73875,0.773,0.172,-6.2225,0.086775,131.98675,0.572,4.0,8.0,1.0,2018.0,5.0
max,1921683.0,91.0,0.996,0.971,0.995,0.995,0.991,0.532,0.952,209.596,0.981,5.0,11.0,1.0,2022.0,6.0


In [7]:
# mixed_playlist

In [6]:
# display the correlation matrix between numeric variables
corr = mixed_playlist.corr()
corr.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,length,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,time_signature,key,mode,release_year,user
length,1.0,-0.095678,0.00843,-0.12779,-0.025789,0.168193,-0.012691,-0.069277,-0.034805,-0.003102,-0.196543,-0.02302,-0.026732,0.018647,-0.134922,-0.002029
popularity,-0.095678,1.0,-0.031579,0.059827,0.049838,-0.130565,-0.053067,0.095978,0.039192,0.022633,0.034257,-0.013903,0.004236,-0.008805,0.018607,-0.114075
acousticness,0.00843,-0.031579,1.0,-0.385711,-0.799091,0.163068,-0.118148,-0.688113,-0.163287,-0.186696,-0.270082,-0.225935,-0.041198,-0.014634,-0.194986,-0.165433
danceability,-0.12779,0.059827,-0.385711,1.0,0.420806,-0.371521,0.007763,0.48968,0.250873,0.084809,0.573579,0.242775,0.024329,-0.015994,0.12288,-0.080037
energy,-0.025789,0.049838,-0.799091,0.420806,1.0,-0.214224,0.160296,0.827295,0.231508,0.207777,0.422506,0.229897,0.053166,-0.021008,0.203005,0.121438
instrumentalness,0.168193,-0.130565,0.163068,-0.371521,-0.214224,1.0,-0.091973,-0.411947,-0.256769,-0.035079,-0.40729,-0.113255,-0.000996,0.004675,0.012262,0.171177
liveness,-0.012691,-0.053067,-0.118148,0.007763,0.160296,-0.091973,1.0,0.095043,0.111733,0.037593,0.115369,-0.01328,0.008491,0.00652,-0.025289,-0.031247
loudness,-0.069277,0.095978,-0.688113,0.48968,0.827295,-0.411947,0.095043,1.0,0.191693,0.18457,0.42828,0.235168,0.039602,-0.005157,0.231127,0.048809
speechiness,-0.034805,0.039192,-0.163287,0.250873,0.231508,-0.256769,0.111733,0.191693,1.0,0.036513,0.250839,0.067924,0.017841,-0.033367,0.097793,-0.095971
tempo,-0.003102,0.022633,-0.186696,0.084809,0.207777,-0.035079,0.037593,0.18457,0.036513,1.0,0.089352,0.006864,0.019338,-0.057536,0.038141,0.01311


In [7]:
correlation_matrix = mixed_playlist.corr()
# You can set a threshold for correlation values and remove features with high correlation
threshold = 0.7  # Adjust this threshold as needed
highly_correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            colname = correlation_matrix.columns[i]
            highly_correlated_features.add(colname)

mixed_playlist.drop(highly_correlated_features, axis=1, inplace=True)

In [1]:
# mixed_playlist

In [8]:
# It is considered Irrelevant features every description or string that does not contribute in understanding the nature of the song
irrelevant_features = ['name', 'album', 'artist','release_date','uri']  # Modify this list with your irrelevant feature names
mixed_playlist.drop(irrelevant_features, axis=1, inplace=True)


In [9]:
mixed_playlist

Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,top_year,user
0,76933,53,0.996000,0.329,0.866000,0.0906,0.0448,70.295,0.238,4,11,0,1996,0,0
1,172626,62,0.622000,0.615,0.000008,0.1920,0.2530,86.976,0.626,4,1,1,2018,2022,1
2,175269,72,0.413000,0.834,0.000040,0.1130,0.3410,89.989,0.356,4,6,0,2019,2020,2
3,175266,0,0.404000,0.797,0.000153,0.2550,0.0327,128.027,0.539,4,5,0,2018,2018,2
4,264735,53,0.061600,0.788,0.711000,0.1000,0.0318,107.993,0.525,4,7,0,2016,2020,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3894,324133,41,0.992000,0.529,0.922000,0.1100,0.0575,115.924,0.585,4,0,0,1962,2020,5
3895,185506,0,0.942000,0.417,0.001700,0.1250,0.0871,76.774,0.505,3,4,0,2019,2020,1
3896,352105,51,0.000016,0.467,0.908000,0.0856,0.0414,122.004,0.061,4,0,0,2017,2021,4
3897,154826,16,0.045300,0.589,0.008480,0.1020,0.0569,182.051,0.658,4,6,1,2017,2018,5


In [16]:
mixed_playlist['user'].value_counts().get(0)

100

In [11]:
# Extracting rows where 'user' is 0 into a new DataFrame
unknown_df = mixed_playlist[mixed_playlist['user'] == 0]

# Removing these rows from the original mixed_playlist DataFrame
mixed_playlist = mixed_playlist[mixed_playlist['user'] != 0]

# Displaying the first few rows of each DataFrame to confirm the operation
unknown_df_head = unknown_df.head()
mixed_playlist_head = mixed_playlist.head()

In [12]:
unknown_df_head

Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,top_year,user
0,76933,53,0.996,0.329,0.866,0.0906,0.0448,70.295,0.238,4,11,0,1996,0,0
75,183773,22,0.994,0.629,0.947,0.0864,0.0406,111.454,0.241,4,0,0,2018,0,0
80,551666,20,0.981,0.208,0.919,0.0843,0.0342,62.543,0.042,4,4,1,1998,0,0
92,321146,26,0.87,0.258,0.898,0.148,0.0381,98.874,0.0378,4,5,0,2019,0,0
150,1009706,27,0.913,0.502,0.857,0.0914,0.0473,105.74,0.559,4,7,0,1992,0,0


In [12]:
# # Feauture Scaling
# scaler = MinMaxScaler()
# mixed_playlist_scaled = scaler.fit_transform(mixed_playlist)
# mixed_playlist = pd.DataFrame(mixed_playlist_scaled, columns=mixed_playlist.columns)
# mixed_playlist


Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,top_year,user
0,0.024267,0.582418,1.000000,0.338826,0.870352,0.074710,0.047059,0.335383,0.242610,0.8,1.000000,0.0,0.600000,0.000000,0.000000
1,0.074881,0.681319,0.624497,0.633368,0.000008,0.178913,0.265756,0.414970,0.638124,0.8,0.090909,1.0,0.938462,1.000000,0.166667
2,0.076279,0.791209,0.414658,0.858908,0.000040,0.097729,0.358193,0.429345,0.362895,0.8,0.545455,0.0,0.953846,0.999011,0.333333
3,0.076278,0.000000,0.405622,0.820803,0.000154,0.243654,0.034349,0.610827,0.549439,0.8,0.454545,0.0,0.938462,0.998022,0.333333
4,0.123600,0.582418,0.061846,0.811535,0.714573,0.084370,0.033403,0.515244,0.535168,0.8,0.636364,0.0,0.907692,0.999011,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3885,0.155017,0.450549,0.995984,0.544799,0.926633,0.094646,0.060399,0.553083,0.596330,0.8,0.000000,0.0,0.076923,0.999011,0.833333
3886,0.081694,0.000000,0.945783,0.429454,0.001709,0.110061,0.091492,0.366295,0.514781,0.6,0.363636,0.0,0.953846,0.999011,0.166667
3887,0.169812,0.560440,0.000015,0.480947,0.912563,0.069571,0.043487,0.582091,0.062181,0.8,0.000000,0.0,0.923077,0.999505,0.666667
3888,0.065467,0.175824,0.045481,0.606591,0.008523,0.086425,0.059769,0.868581,0.670744,0.8,0.545455,1.0,0.923077,0.998022,0.833333


In [15]:
# K-Means clustering
X = mixed_playlist.values
model = KMeans(
    n_clusters = 6, # 6 clusters referred to 6 users 'delta': 1, 'gamma': 2, 'alpha': 3, 'dzeta': 4, 'beta': 5, 'epsilon': 6
    n_init = 'auto', # multiple attempt with different initial centroids
    random_state = 1, # fix ramdom state to guarantee reproducibility
    verbose = 1 # output log
)
model.fit(X)

Initialization complete
Iteration 0, inertia 3265.7445493617574.
Iteration 1, inertia 2317.2999583976016.
Iteration 2, inertia 2224.743561332685.
Iteration 3, inertia 2173.0526063718257.
Iteration 4, inertia 2150.3130571052707.
Iteration 5, inertia 2140.4745601340096.
Iteration 6, inertia 2135.5147457575576.
Iteration 7, inertia 2131.9030368468575.
Iteration 8, inertia 2129.755640896196.
Iteration 9, inertia 2127.7029148495662.
Iteration 10, inertia 2125.324231946084.
Iteration 11, inertia 2123.155279879134.
Iteration 12, inertia 2119.311176100692.
Iteration 13, inertia 2111.676113225555.
Iteration 14, inertia 2103.895951510157.
Iteration 15, inertia 2094.537046509772.
Iteration 16, inertia 2087.5700518286258.
Iteration 17, inertia 2084.0949590367422.
Iteration 18, inertia 2082.649917700795.
Iteration 19, inertia 2082.0990387620677.
Iteration 20, inertia 2082.0139785181586.
Iteration 21, inertia 2081.9986325135046.
Iteration 22, inertia 2081.9926300333095.
Converged at iteration 22: st

In [16]:
# obtain the cluster centers
model.cluster_centers_

array([[ 1.24691984e-01,  2.80229205e-01,  4.92879904e-01,
         4.68404661e-01,  8.44183180e-01,  1.38046381e-01,
         5.70203382e-02,  5.37977464e-01,  2.43086744e-01,
         7.61234991e-01,  4.54545455e-01,  1.00000000e+00,
         8.60878744e-01,  9.50781540e-01,  6.28359062e-01],
       [ 1.07171004e-01,  3.69287214e-01,  7.13417008e-01,
         5.60597620e-01,  4.85461232e-02,  1.54754386e-01,
         9.35371340e-02,  5.57134842e-01,  4.06326463e-01,
         7.57945425e-01,  4.80373559e-01,  4.99600361e-16,
         8.29213483e-01,  9.55309413e-01,  3.42964152e-01],
       [ 1.03506931e-01,  3.73385738e-01,  1.44747904e-01,
         6.78237375e-01,  6.86187429e-02,  1.69240651e-01,
         1.20925724e-01,  5.70199522e-01,  5.38350434e-01,
         7.97080292e-01,  5.68458306e-01, -4.44089210e-16,
         8.93542953e-01,  9.97329257e-01,  6.17396594e-01],
       [ 1.14593585e-01,  3.28877068e-01,  7.68805272e-01,
         5.33351044e-01,  4.83792502e-02,  1.48054208

In [17]:
cluster_centers = pd.DataFrame(
    model.cluster_centers_,
    columns = mixed_playlist.columns
)
cluster_centers

Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,top_year,user
0,0.124692,0.280229,0.49288,0.468405,0.844183,0.138046,0.05702,0.537977,0.243087,0.761235,0.454545,1.0,0.860879,0.950782,0.628359
1,0.107171,0.369287,0.713417,0.560598,0.048546,0.154754,0.093537,0.557135,0.406326,0.757945,0.480374,4.996004e-16,0.829213,0.955309,0.342964
2,0.103507,0.373386,0.144748,0.678237,0.068619,0.169241,0.120926,0.5702,0.53835,0.79708,0.568458,-4.440892e-16,0.893543,0.997329,0.617397
3,0.114594,0.328877,0.768805,0.533351,0.048379,0.148054,0.072646,0.519994,0.32922,0.765605,0.436402,1.0,0.811073,0.972936,0.5046
4,0.127517,0.313405,0.528056,0.45119,0.840092,0.128255,0.054369,0.549438,0.219203,0.758347,0.509756,4.996004e-16,0.862101,0.952801,0.655538
5,0.105349,0.414366,0.151327,0.664906,0.039655,0.170807,0.116431,0.563946,0.518249,0.793963,0.413386,1.0,0.86489,0.994543,0.576334


In [24]:
mixed_playlist

Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,top_year,user
0,0.024267,0.582418,1.000000,0.338826,0.870352,0.074710,0.047059,0.335383,0.242610,0.8,1.000000,0.0,0.600000,0.000000,0.000000
1,0.074881,0.681319,0.624497,0.633368,0.000008,0.178913,0.265756,0.414970,0.638124,0.8,0.090909,1.0,0.938462,1.000000,0.166667
2,0.076279,0.791209,0.414658,0.858908,0.000040,0.097729,0.358193,0.429345,0.362895,0.8,0.545455,0.0,0.953846,0.999011,0.333333
3,0.076278,0.000000,0.405622,0.820803,0.000154,0.243654,0.034349,0.610827,0.549439,0.8,0.454545,0.0,0.938462,0.998022,0.333333
4,0.123600,0.582418,0.061846,0.811535,0.714573,0.084370,0.033403,0.515244,0.535168,0.8,0.636364,0.0,0.907692,0.999011,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3885,0.155017,0.450549,0.995984,0.544799,0.926633,0.094646,0.060399,0.553083,0.596330,0.8,0.000000,0.0,0.076923,0.999011,0.833333
3886,0.081694,0.000000,0.945783,0.429454,0.001709,0.110061,0.091492,0.366295,0.514781,0.6,0.363636,0.0,0.953846,0.999011,0.166667
3887,0.169812,0.560440,0.000015,0.480947,0.912563,0.069571,0.043487,0.582091,0.062181,0.8,0.000000,0.0,0.923077,0.999505,0.666667
3888,0.065467,0.175824,0.045481,0.606591,0.008523,0.086425,0.059769,0.868581,0.670744,0.8,0.545455,1.0,0.923077,0.998022,0.833333


In [55]:

# Initialize an empty list to store predictions
predictions = []
# Loop through each row in the DataFrame
for n in range(len(mixed_playlist)):
    # Access the nth row and reshape it to a 2D array
    row_data = mixed_playlist.loc[n].values.reshape(1, -1)
    # Get the prediction and append it to the list
    prediction = model.predict(row_data)
    predictions.append(prediction[0])  # Assuming 'predict' returns a list or array
# Convert the list of predictions to a DataFrame
predictions_df = pd.DataFrame(predictions, columns=['Prediction'])
# Now 'predictions_df' is a DataFrame with all predictions
print(predictions_df)

mixed_playlist

      Prediction
0              4
1              3
2              1
3              2
4              2
...          ...
3885           4
3886           1
3887           4
3888           5
3889           5

[3890 rows x 1 columns]


Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,top_year,user
0,0.024267,0.582418,1.000000,0.338826,0.870352,0.074710,0.047059,0.335383,0.242610,0.8,1.000000,0.0,0.600000,0.000000,0.000000
1,0.074881,0.681319,0.624497,0.633368,0.000008,0.178913,0.265756,0.414970,0.638124,0.8,0.090909,1.0,0.938462,1.000000,0.166667
2,0.076279,0.791209,0.414658,0.858908,0.000040,0.097729,0.358193,0.429345,0.362895,0.8,0.545455,0.0,0.953846,0.999011,0.333333
3,0.076278,0.000000,0.405622,0.820803,0.000154,0.243654,0.034349,0.610827,0.549439,0.8,0.454545,0.0,0.938462,0.998022,0.333333
4,0.123600,0.582418,0.061846,0.811535,0.714573,0.084370,0.033403,0.515244,0.535168,0.8,0.636364,0.0,0.907692,0.999011,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3885,0.155017,0.450549,0.995984,0.544799,0.926633,0.094646,0.060399,0.553083,0.596330,0.8,0.000000,0.0,0.076923,0.999011,0.833333
3886,0.081694,0.000000,0.945783,0.429454,0.001709,0.110061,0.091492,0.366295,0.514781,0.6,0.363636,0.0,0.953846,0.999011,0.166667
3887,0.169812,0.560440,0.000015,0.480947,0.912563,0.069571,0.043487,0.582091,0.062181,0.8,0.000000,0.0,0.923077,0.999505,0.666667
3888,0.065467,0.175824,0.045481,0.606591,0.008523,0.086425,0.059769,0.868581,0.670744,0.8,0.545455,1.0,0.923077,0.998022,0.833333


In [60]:
# mixed_playlist = scaler.inverse_transform(mixed_playlist)

# mixed_playlist_df = pd.DataFrame(mixed_playlist)

# mixed_playlist_df
