# Model Evaluation
Evaluating will be made up of the following two testing metrics for the 3 models.
1. Silhouette Score
2. Genre Evaluation

Given that we were unable to train the hierarchical clustering model on the entire dataset (due to its relative inefficiency) we wll stick to evaluating the k-means and fuzzy c-means models.

In [3]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import correlation
from tqdm.notebook import tqdm
import pickle
import os

# Load in Data

## Load song feature data 

In [4]:
data = pd.read_csv('data/dataset.csv', index_col=0)
data.drop_duplicates(subset='track_id', inplace=True)
features = ['track_id', 'track_name', 'artists', 'popularity', 'track_genre', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']
X = data[features].copy()
X.drop_duplicates(subset=['track_name', 'artists'], keep='first', inplace=True)
X.reset_index(drop=True, inplace=True)  # fix indices after removing duplicates

# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[['popularity', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']])
cos_scaled = X_scaled / np.linalg.norm(X_scaled, axis=1)[:, np.newaxis]

In [5]:
data.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


# Load in Models

In [6]:
with open('models/eucd.pkl', 'rb') as file:
    euc_model = pickle.load(file)
with open('models/cos.pkl', 'rb') as file:
    cos_model = pickle.load(file)
with open('models/fuzzy.pkl', 'rb') as file:
    (cntr, u) = pickle.load(file)

In [7]:
results = X[['track_id', 'artists', 'track_name']].copy()
results['euc_cluster'] = euc_model.predict(X_scaled).astype(str)
results['cos_cluster'] = cos_model.predict(cos_scaled).astype(str)

## Create results matrix for the cosine and euclidean models

# 1. Compute Silhouette Scores 

# 2. Compare with User-made Playlists

## Load user playlist data

In [11]:
playlists['track_name'] = playlists['track_name'].str.replace(r"\(.*\)","", regex=True)

In [8]:
# Load first 1 million data points, excluding the user id
playlists = pd.read_csv('data/spotify_dataset.csv', on_bad_lines='skip', usecols=[' "artistname"', ' "trackname"', ' "playlistname"'])

# Rename columns
playlists.rename(columns={' "artistname"': 'artists', ' "trackname"': 'track_name', ' "playlistname"': 'playlist'}, inplace=True)

# Remove all parantheticals in song names to treat more songs as the same
playlists['track_name'].str.replace(r"\(.*\)","", regex=True)

KeyError: Index(['track_id'], dtype='object')

## Find all the playlists containing songs in the original playlist

In [12]:
mixed_data = pd.merge(X, playlists, how='left', on=['artists', 'track_name'])
mixed_data.drop_duplicates(subset=['track_id', 'playlist'], inplace=True)

## Evaluate models based on playlist prediction

In [28]:
len(mixed_data)

721561

In [75]:
evals = 10  # Number of songs to consider
k = 5  # Number of songs to predict based on the given playlist
p_length = 5  # Number of songs to pick from the playlist

valid_lists = mixed_data.groupby(by='playlist').filter(lambda g: len(g) >= p_length + k)
valid_playlists = valid_lists['playlist'].unique()

In [76]:
percentages = []
np.random.seed(42)
for _ in tqdm(range(evals)):
    # Sample random playlist from the possible playlists
    play = np.random.choice(valid_playlists)

    # Sample a random playlist
    total_playlist = valid_lists[valid_lists.playlist == play]

    # Sample `p_length` songs from that playlist
    playlist = valid_lists[valid_lists.playlist == play].sample(p_length).track_id

    # Look up those songs in the catelog based on their id and find their index
    playlist = results[results.track_id.isin(playlist)].index

    # Find the majority cluster of the songs in the playlist
    clusters = results.iloc[playlist]['cos_cluster']
    majority = np.argmax(np.bincount(clusters))

    # For each song in majority cluster, compute the correlation distance to each song in the playlist
    song_score = []
    indices = results.index # results[results['cos_cluster'] == str(majority).index
    for song in indices:
        score = float('inf')
        for liked_song in playlist:
            score = min(score, correlation(cos_scaled[song], cos_scaled[liked_song])) # min(score, np.mean((X_scaled[song] - X_scaled[liked_song])**2))
        song_score.append(score)

    # Find the `k` most similar songs based on the correlation distance
    most_similar = np.argpartition(song_score, kth=k+p_length)[:k+p_length]
    songs = results.iloc[indices[most_similar]]
    songs = songs[~songs.index.isin(playlist)][:k]

    # Compute the percentage of the songs in the user-made playlists
    inn = total_playlist[total_playlist.track_id.isin(songs.track_id)].count()
    # display(X.iloc[indices[most_similar]])
    # display(total_playlist)
    percentages.append(inn / (len(total_playlist) - p_length))
        

  0%|          | 0/10 [00:00<?, ?it/s]

In [72]:
percentages = []
np.random.seed(42)
for _ in tqdm(range(evals)):
    # Sample random playlist from the possible playlists
    play = np.random.choice(valid_playlists)

    # Sample a random playlist
    total_playlist = valid_lists[valid_lists.playlist == play]

    # Sample `p_length` songs from that playlist
    playlist = valid_lists[valid_lists.playlist == play].sample(p_length).track_id

    # Look up those songs in the catelog based on their id and find their index
    playlist = results[results.track_id.isin(playlist)].index

    indices = results.index
    
    # Compute song_score for each song in the same cluster
    song_score = []
    for song in indices:
        score = float('inf')
        for liked_song in playlist:
            mse = u[song] @ u[liked_song].T
            score = min(score, -mse)
        song_score.append(score)

    # Find the `k` most similar songs based on the correlation distance
    most_similar = np.argpartition(song_score, kth=k+p_length)[:k+p_length]
    songs = results.iloc[indices[most_similar]]
    songs = songs[~songs.index.isin(playlist)][:k]

    # Compute the percentage of the songs in the user-made playlists
    inn = total_playlist[total_playlist.track_id.isin(songs.track_id)].count()
    # display(X.iloc[indices[most_similar]])
    # display(total_playlist)
    percentages.append(inn / (len(total_playlist) - p_length))
        

  0%|          | 0/100 [00:00<?, ?it/s]

K-Means + Euclideai

In [68]:
np.mean(percentages)

0.001964285714285714

In [71]:
np.mean(percentages)

0.00125

In [73]:
np.mean(percentages)

0.0

In [77]:
np.mean(percentages)

0.0007462686567164178