In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv("../data/top10s_final.csv", index_col=0)
dtypes = df.dtypes
numerical_cols = list(dtypes[dtypes != 'object'][1:].index)
categorical_cols = list(dtypes[dtypes == "object"].index) + ["year"]
df.shape

(603, 14)

In [4]:
df.head()

Unnamed: 0,title,artist,genre,year,tempo,energy,danceability,loudness,liveness,valence,length,acousticness,speechiness,popularity
1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
2,Love The Way You Lie,Eminem,hip hop,2010,87,93,75,-5,52,64,263,24,23,82
3,TiK ToK,Kesha,pop,2010,120,84,76,-3,29,71,200,10,14,80
4,Bad Romance,Lady Gaga,pop,2010,119,92,70,-4,8,71,295,0,4,79
5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [5]:
len(df.title.unique()) + len(df.artist.unique()) + len(df.genre.unique())

788

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 603 entries, 1 to 603
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         603 non-null    object
 1   artist        603 non-null    object
 2   genre         603 non-null    object
 3   year          603 non-null    int64 
 4   tempo         603 non-null    int64 
 5   energy        603 non-null    int64 
 6   danceability  603 non-null    int64 
 7   loudness      603 non-null    int64 
 8   liveness      603 non-null    int64 
 9   valence       603 non-null    int64 
 10  length        603 non-null    int64 
 11  acousticness  603 non-null    int64 
 12  speechiness   603 non-null    int64 
 13  popularity    603 non-null    int64 
dtypes: int64(11), object(3)
memory usage: 70.7+ KB


There will be additional **789** features after OneHotEncoding these three features

In [7]:
df

Unnamed: 0,title,artist,genre,year,tempo,energy,danceability,loudness,liveness,valence,length,acousticness,speechiness,popularity
1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
2,Love The Way You Lie,Eminem,hip hop,2010,87,93,75,-5,52,64,263,24,23,82
3,TiK ToK,Kesha,pop,2010,120,84,76,-3,29,71,200,10,14,80
4,Bad Romance,Lady Gaga,pop,2010,119,92,70,-4,8,71,295,0,4,79
5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,Find U Again (feat. Camila Cabello),Mark Ronson,pop,2019,104,66,61,-7,20,16,176,1,3,75
600,Cross Me (feat. Chance the Rapper & PnB Rock),Ed Sheeran,pop,2019,95,79,75,-6,7,61,206,21,12,75
601,"No Brainer (feat. Justin Bieber, Chance the Ra...",DJ Khaled,pop,2019,136,76,53,-5,9,65,260,7,34,70
602,Nothing Breaks Like a Heart (feat. Miley Cyrus),Mark Ronson,pop,2019,114,79,60,-6,42,24,217,1,7,69


In [8]:
sorted_years = df.year.sort_values().unique()
preprocessor = ColumnTransformer([
    ("scaler", StandardScaler(), list(range(4, 14))),
    ("ohe", OneHotEncoder(sparse_output=False), [2]),
    ("ordinal", OrdinalEncoder(categories=[sorted_years]), [3])
], remainder="drop")
preprocessor.fit(df.values)

In [9]:
transformed_data = preprocessor.transform(df.values)
transformed_data.shape

(603, 31)

# Model Selection

## KMeans

In [10]:
model = KMeans(n_clusters=8, n_init="auto")
model.fit(transformed_data)
labels = model.predict(transformed_data)
silhouette_score(transformed_data, labels)

0.11762835305202332

## Agglomerative Clustering

In [11]:
model = AgglomerativeClustering(n_clusters=8)
labels = model.fit_predict(transformed_data)
silhouette_score(transformed_data, labels)

0.09242900825709757

## Birch

In [12]:
model = Birch(n_clusters=8)
labels = model.fit_predict(transformed_data)
silhouette_score(transformed_data, labels)

0.0940256960795925

# Hyper Parameter Tuning using GridSearchCV

### We select KMeans based on the silhouette score

In [13]:
params = {
    "n_clusters": list(range(10, 40, 5)),
    "algorithm": ["lloyd", "elkan"],
    "max_iter": list(range(300, 1000, 300)),
    "init": ["k-means++", "random"],
    "n_init": ["auto"]
}

def scorer(estimator, xtest):
    labels = estimator.predict(transformed_data)
    return silhouette_score(transformed_data, labels)

grid_search = GridSearchCV(KMeans(n_init="auto"), params, return_train_score=True, scoring=scorer)
grid_search.fit(transformed_data)
grid_search.best_estimator_

In [14]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_init,param_max_iter,param_n_clusters,param_n_init,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.08162,0.020301,0.022077,0.012881,lloyd,k-means++,300,10,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,0.105789,0.003294,16,0.100266,0.106482,0.104806,0.107048,0.110341,0.105789,0.003294
1,0.067277,0.007718,0.029264,0.013018,lloyd,k-means++,300,15,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,0.104792,0.011924,17,0.112237,0.095763,0.093135,0.098176,0.124651,0.104792,0.011924
2,0.086102,0.023509,0.024244,0.008659,lloyd,k-means++,300,20,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,0.086718,0.005017,60,0.084987,0.08938,0.089345,0.077778,0.092098,0.086718,0.005017
3,0.086186,0.019632,0.026416,0.011514,lloyd,k-means++,300,25,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,0.087389,0.008323,58,0.078842,0.093214,0.076667,0.08998,0.098239,0.087389,0.008323
4,0.119317,0.029957,0.018518,0.009593,lloyd,k-means++,300,30,auto,"{'algorithm': 'lloyd', 'init': 'k-means++', 'm...",...,0.088257,0.00402,52,0.081519,0.093502,0.090137,0.089527,0.086599,0.088257,0.00402


In [15]:
grid_search.best_score_

0.11920968078688739

In [16]:
preprocessor

In [17]:
final_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", grid_search.best_estimator_),
])
final_model

In [18]:
df.head().iloc[0]

title           Hey, Soul Sister
artist                     Train
genre                 neo mellow
year                        2010
tempo                         97
energy                        89
danceability                  67
loudness                      -4
liveness                       8
valence                       80
length                       217
acousticness                  19
speechiness                    4
popularity                    83
Name: 1, dtype: object

In [19]:
final_model.predict([df.head().iloc[0].values])[0]

2

In [20]:
df["clusters"] = final_model.predict(df.values)
df

Unnamed: 0,title,artist,genre,year,tempo,energy,danceability,loudness,liveness,valence,length,acousticness,speechiness,popularity,clusters
1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83,2
2,Love The Way You Lie,Eminem,hip hop,2010,87,93,75,-5,52,64,263,24,23,82,4
3,TiK ToK,Kesha,pop,2010,120,84,76,-3,29,71,200,10,14,80,2
4,Bad Romance,Lady Gaga,pop,2010,119,92,70,-4,8,71,295,0,4,79,2
5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,Find U Again (feat. Camila Cabello),Mark Ronson,pop,2019,104,66,61,-7,20,16,176,1,3,75,1
600,Cross Me (feat. Chance the Rapper & PnB Rock),Ed Sheeran,pop,2019,95,79,75,-6,7,61,206,21,12,75,3
601,"No Brainer (feat. Justin Bieber, Chance the Ra...",DJ Khaled,pop,2019,136,76,53,-5,9,65,260,7,34,70,1
602,Nothing Breaks Like a Heart (feat. Miley Cyrus),Mark Ronson,pop,2019,114,79,60,-6,42,24,217,1,7,69,1


In [21]:
df.shape

(603, 15)

# Recommender Class

In [22]:
class SongRecommender:
    def __init__(self, model):
        self.model = model
        self.data = None
    
    def fit(self, data):
        self.data = data
        self.model.fit(data.values)
        self.data["clusters"] = self.model.predict(data.values)
    
    def get_cluster(self, user_data):
        return self.model.predict([user_data])[0]
    
    def get_k_recommendations(self, user_data, k):
        cluster = self.get_cluster(user_data)
        similar_songs = self.data[self.data["clusters"] == cluster].copy()
        
        # drop the song entered by user from the similar songs
        index = similar_songs[(similar_songs["title"] == user_data["title"]) & 
                             (similar_songs["artist"] == user_data["artist"])]
        if not index.empty:
            index = index.index[0]
            similar_songs.drop(index, inplace=True)
        
        return similar_songs.sample(k).index.tolist()
    

user_data = df.iloc[12]
# Data Preprocessor
sorted_years = df.year.sort_values().unique()
preprocessor = ColumnTransformer([
    ("scaler", StandardScaler(), list(range(4, 14))),
    ("ohe", OneHotEncoder(sparse_output=False), [2]),
    ("ordinal", OrdinalEncoder(categories=[sorted_years]), [3])
], remainder="drop")
# preprocessor.fit(df.values)

# Best model from grid search
model = KMeans(init='random', max_iter=900, n_clusters=10, n_init='auto')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

recommender = SongRecommender(pipeline)
recommender.fit(df)
recommender.get_k_recommendations(user_data, 6)

[76, 188, 96, 16, 102, 201]

In [23]:
user_data

title           Cooler Than Me - Single Mix
artist                          Mike Posner
genre                                   pop
year                                   2010
tempo                                   130
energy                                   82
danceability                             77
loudness                                 -5
liveness                                 70
valence                                  63
length                                  213
acousticness                             18
speechiness                               5
popularity                               73
clusters                                  4
Name: 13, dtype: object