In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("../data/top10s_final.csv", index_col=0)
dtypes = df.dtypes
numerical_cols = list(dtypes[dtypes != 'object'][1:].index)
categorical_cols = list(dtypes[dtypes == "object"].index) + ["year"]
df.shape

(603, 14)

In [3]:
df.head()

Unnamed: 0,title,artist,genre,year,tempo,energy,danceability,loudness,liveness,valence,length,acousticness,speechiness,popularity
1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
2,Love The Way You Lie,Eminem,hip hop,2010,87,93,75,-5,52,64,263,24,23,82
3,TiK ToK,Kesha,pop,2010,120,84,76,-3,29,71,200,10,14,80
4,Bad Romance,Lady Gaga,pop,2010,119,92,70,-4,8,71,295,0,4,79
5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [4]:
len(df.title.unique()) + len(df.artist.unique()) + len(df.genre.unique())

788

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 603 entries, 1 to 603
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         603 non-null    object
 1   artist        603 non-null    object
 2   genre         603 non-null    object
 3   year          603 non-null    int64 
 4   tempo         603 non-null    int64 
 5   energy        603 non-null    int64 
 6   danceability  603 non-null    int64 
 7   loudness      603 non-null    int64 
 8   liveness      603 non-null    int64 
 9   valence       603 non-null    int64 
 10  length        603 non-null    int64 
 11  acousticness  603 non-null    int64 
 12  speechiness   603 non-null    int64 
 13  popularity    603 non-null    int64 
dtypes: int64(11), object(3)
memory usage: 70.7+ KB


There will be additional **789** features after OneHotEncoding these three features

In [19]:
sorted_years = df.year.sort_values().unique()
preprocessor = ColumnTransformer([
    ("scaler", StandardScaler(), list(range(4, 14))),
    ("ordinal", OrdinalEncoder(categories=[sorted_years]), [3])
], remainder="drop")
preprocessor.fit(df.values)

In [20]:
transformed_data = preprocessor.transform(df.values)
transformed_data.shape

(603, 11)

# Model Selection

## KMeans

In [25]:
model = KMeans(n_clusters=8, n_init="auto")
model.fit(transformed_data)
labels = model.predict(transformed_data)
silhouette_score(transformed_data, labels)

0.15161018786201555

## Agglomerative Clustering

In [29]:
model = AgglomerativeClustering(n_clusters=8)
labels = model.fit_predict(transformed_data)
silhouette_score(transformed_data, labels)

0.10076278748921833

## Birch

In [30]:
model = Birch(n_clusters=8)
labels = model.fit_predict(transformed_data)
silhouette_score(transformed_data, labels)

0.09712564858948314

# Hyper Parameter Tuning using GridSearchCV

### We select KMeans based on the silhouette score

In [58]:
params = {
    "n_clusters": list(range(10, 40, 5)),
    "algorithm": ["lloyd", "elkan"],
    "max_iter": list(range(300, 1000, 300)),
    "init": ["k-means++", "random"],
    "n_init": ["auto"]
}

def scorer(estimator, xtest):
    labels = estimator.predict(transformed_data)
    return calinski_harabasz_score(transformed_data, labels)

grid_search = GridSearchCV(KMeans(n_init="auto"), params, return_train_score=True, scoring=scorer)
grid_search.fit(transformed_data)
grid_search.best_estimator_

In [59]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_init,param_max_iter,param_n_clusters,param_n_init,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.022353,0.01546,0.004838,0.005103,lloyd,k-means++,300,10,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,79.528411,3.131586,12,80.523947,75.447857,84.058684,76.589436,81.022128,79.528411,3.131586
1,0.00238,0.000157,0.000669,1e-05,lloyd,k-means++,300,15,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,65.147851,2.776731,21,65.478941,66.397014,67.475835,59.742321,66.645145,65.147851,2.776731
2,0.006002,0.005984,0.000885,0.000195,lloyd,k-means++,300,20,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,55.179347,3.195881,35,51.541154,57.488797,57.276352,51.066561,58.523869,55.179347,3.195881
3,0.003277,0.000184,0.000857,9e-06,lloyd,k-means++,300,25,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,48.918606,2.349527,45,46.858293,52.246519,50.240276,45.728909,49.519036,48.918606,2.349527
4,0.003691,0.000194,0.000953,7e-06,lloyd,k-means++,300,30,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,44.972362,2.321233,49,43.756172,46.78024,46.994885,40.955513,46.375002,44.972362,2.321233


In [60]:
grid_search.best_score_

81.72129271018099