## Read in Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import seaborn as sns
from umap import UMAP

from sklearn.neighbors import NearestCentroid

from sklearn.preprocessing import MinMaxScaler

## Read In and Prepare Data

In [None]:
# Load the Spotify tracks table straight from the project's GitHub repository.
spotify = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/cjsyndergaard/project_486/main/data/spotify_data.csv'
)

In [None]:
# A single song can occupy several rows, one for each genre it is listed under.
# The rows for the artist 'Jim Croce' are displayed as an example.
spotify.loc[
    spotify['artists'].eq('Jim Croce'),
    ['track_id', 'artists', 'track_name', 'track_genre'],
]

In [None]:
# Collapse every track_id to its first row so each song carries exactly one genre,
# and renumber the rows 0..n-1 afterwards.
spotify = spotify.drop_duplicates(subset='track_id', keep='first', ignore_index=True)

# Response: the track's genre. Predictors: the columns at positions 5 through 19.
genre = spotify['track_genre']
x = spotify.iloc[:, 5:20]

# Bar chart of the number of remaining tracks per genre.
genre.value_counts().plot.bar(figsize=(20, 4))

In [None]:
# Hold out 20% of the tracks for testing; the seed is fixed so the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    genre,
    test_size=.2,
    random_state=307,
)

## Initial Model

In [None]:
# Baseline model: a 500-tree random forest predicting the original fine-grained genre.
genre_rf = RandomForestClassifier(n_estimators=500, random_state=7567)

genre_rf.fit(xtrain, ytrain)
yhat = genre_rf.predict(xtest)
# scikit-learn metrics are documented as (y_true, y_pred); use that order here
# as the later cells of this notebook do.
print(accuracy_score(ytest, yhat))
c = confusion_matrix(ytest, yhat)
# Rows are true genres and columns are predicted genres, so the diagonal holds the
# correct predictions. The correct genre is the most likely prediction, but the
# problem is clearly a hard one.
sns.heatmap(c, annot=False)

## Super Genre vs Natural Clusters

In [None]:
# Hand-picked super genres (chosen with research and domain knowledge).
# Each list holds the track_genre labels that are folded into that super genre.
entertainment = [
    'anime', 'children', 'disney', 'kids', 'opera', 'show-tunes',
]
electronic = [
    'afrobeat', 'ambient', 'breakbeat', 'chicago-house', 'club',
    'dance', 'dancehall', 'deep-house', 'detroit-techno', 'disco',
    'dub', 'dubstep', 'edm', 'electro', 'electronic',
    'funk', 'happy', 'house', 'idm', 'industrial',
    'minimal-techno', 'progressive-house', 'techno', 'trance', 'drum-and-bass',
]
rock = [
    'alt-rock', 'black-metal', 'death-metal', 'emo', 'garage', 'goth', 'grindcore',
    'groove', 'grunge', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 'j-rock',
    'metal', 'metalcore', 'psych-rock', 'punk', 'punk-rock', 'rock', 'rock-n-roll',
]
pop = [
    'cantopop', 'hip-hop', 'indie', 'indie-pop', 'j-dance', 'j-idol', 'j-pop', 'k-pop',
    'mandopop', 'party', 'pop', 'pop-film', 'power-pop', 'synth-pop', 'trip-hop',
    'alternative',
]
folk = [
    'acoustic', 'blues', 'folk', 'honky-tonk', 'jazz', 'r-n-b',
    'singer-songwriter', 'soul', 'bluegrass', 'country', 'guitar', 'rockabilly',
]
latin = [
    'latin', 'latino', 'brazil', 'forro', 'mpb', 'pagode', 'reggae',
    'reggaeton', 'salsa', 'samba', 'sertanejo', 'ska', 'spanish', 'tango',
]
foreign = [
    'british', 'french', 'german', 'indian', 'iranian',
    'malay', 'swedish', 'turkish', 'world-music',
]
easy_listening = [
    'chill', 'classical', 'gospel', 'new-age', 'piano',
    'romance', 'sad', 'sleep', 'study', 'comedy',
]

# Super-genre name -> list of member genres (the same list objects as above).
super_genres = dict(
    entertainment=entertainment,
    electronic=electronic,
    rock=rock,
    pop=pop,
    folk=folk,
    latin=latin,
    foreign=foreign,
    easy_listening=easy_listening,
)

In [None]:
def map_to_supergenre(genre_list, super_genre):
    """Replace every genre label in ``genre_list`` with its super-genre name.

    Parameters
    ----------
    genre_list : pandas.Series
        Genre labels. The Series is overwritten IN PLACE and also returned,
        so the caller's variable changes as well; pass ``series.copy()`` to
        keep the original labels.
    super_genre : dict
        Maps a super-genre name to the collection of genre labels it contains.

    Returns
    -------
    pandas.Series
        The same ``genre_list`` object. Its index is untouched; labels that do
        not belong to any group are left as they were.
    """
    if genre_list.empty:
        return genre_list

    # Invert the mapping once so every label costs one dict lookup instead of
    # a scan through each super-genre list.
    lookup = {}
    for sg, glist in super_genre.items():
        for g in glist:
            # If a genre is listed under several groups, the first group wins.
            lookup.setdefault(g, sg)

    # Each ORIGINAL label is translated exactly once, so a freshly assigned
    # super-genre name can never be re-mapped by a later group.
    genre_list.iloc[:] = [lookup.get(g, g) for g in genre_list]
    return genre_list

# Relabel the training and test targets with their super genres.
# map_to_supergenre writes into the Series it receives, so ytrain and ytest
# themselves hold super-genre labels after this cell has run.
ytrain_sup, ytest_sup = (
    map_to_supergenre(target, super_genres) for target in (ytrain, ytest)
)

In [None]:
## By cluster
from sklearn.cluster import KMeans

# Fit k-means once for every K from 2 through 14 and collect each model's inertia
# (within-cluster sum of squares) for an elbow plot.
# NOTE(review): x is passed to KMeans without any scaling step in this cell -- confirm that is intended.
kmeans_per_k = [
    KMeans(n_clusters=n_clust, n_init='auto', random_state=42).fit(x)
    for n_clust in range(2, 15)
]
inertias = [fitted.inertia_ for fitted in kmeans_per_k]

plt.figure(figsize=(10, 4))
plt.plot(np.arange(2, 15), inertias, marker="o")
plt.ylabel('WCSS')
plt.xlabel('Number of Clusters, K')

# K = 8 is a reasonable choice from the plot, and it equals the number of hand-picked
# super genres, which gives the fairest comparison. The list starts at K = 2, so
# index 6 is the K = 8 model.
xtrain_clust, xtest_clust, ytrain_clust, ytest_clust = train_test_split(
    x,
    kmeans_per_k[6].labels_,
    test_size=.2,
    random_state=307,
)

In [None]:
## Some of the clusters are very uneven in size, however (anomalous tracks).
# Show each cluster id of the K = 8 model next to the number of tracks assigned to it.
k8_model = kmeans_per_k[6]
np.unique(k8_model.labels_, return_counts=True)

## Models with Super Genre vs Natural Clusterings

### Random Forests

In [None]:
# Random forest (500 trees) trained on the super-genre labels.
genre_rf2 = RandomForestClassifier(random_state=7567, n_estimators=500).fit(xtrain, ytrain_sup)
yhat2 = genre_rf2.predict(xtest)

# Test accuracy, then the confusion matrix as an annotated heatmap.
print(accuracy_score(ytest_sup, yhat2))
c2 = confusion_matrix(ytest_sup, yhat2)
plt.figure(figsize=(12, 5))
sns.heatmap(c2, annot=True)

In [None]:
# Random forest (500 trees) trained on the k-means cluster labels.
genre_rf3 = RandomForestClassifier(random_state=7567, n_estimators=500).fit(xtrain, ytrain_clust)
yhat3 = genre_rf3.predict(xtest)

# Test accuracy, then the confusion matrix as an annotated heatmap.
print(accuracy_score(ytest_clust, yhat3))
c3 = confusion_matrix(ytest_clust, yhat3)
plt.figure(figsize=(12, 5))
sns.heatmap(c3, annot=True)

### Naive Bayes

In [None]:
## NAIVE BAYES
from sklearn.naive_bayes import GaussianNB

# Train on ytrain_sup so the target is the same kind of label as the ytest_sup it is
# scored against. Do not rely on ytrain having been overwritten in place by
# map_to_supergenre.
nb = GaussianNB().fit(xtrain, ytrain_sup)
yhat_nb = nb.predict(xtest)
cnb = confusion_matrix(ytest_sup, yhat_nb)
print(accuracy_score(ytest_sup, yhat_nb))

In [None]:
# Gaussian naive Bayes trained on the k-means cluster labels.
nb_clust = GaussianNB().fit(xtrain, ytrain_clust)
yhat_nb_clust = nb_clust.predict(xtest)
# The confusion matrix must pair this model's predictions with the cluster truth
# labels, not with the super-genre labels and predictions of the previous model.
cnb_clust = confusion_matrix(ytest_clust, yhat_nb_clust)
print(accuracy_score(ytest_clust, yhat_nb_clust))

### Non-class Method: Nearest Centroid
This method tries to behave like KNN without KNN's high prediction cost: it computes one centroid per group and assigns each point to the group with the nearest centroid. It turned out to perform quite poorly.

In [None]:
# Nearest-centroid classifier fitted on the super-genre labels.
nc = NearestCentroid()
nc.fit(xtrain, ytrain_sup)
yhat_nc = nc.predict(xtest)

# Test accuracy and confusion matrix for the super-genre model.
print(accuracy_score(ytest_sup, yhat_nc))
c_nc = confusion_matrix(ytest_sup, yhat_nc)

In [None]:
# Nearest-centroid classifier fitted on the k-means cluster labels.
# The name nc is rebound here, replacing the super-genre model from the previous cell.
nc = NearestCentroid()
nc.fit(xtrain, ytrain_clust)
yhat_nc_clust = nc.predict(xtest)

# Test accuracy and confusion matrix for the cluster model.
print(accuracy_score(ytest_clust, yhat_nc_clust))
c_nc_clust = confusion_matrix(ytest_clust, yhat_nc_clust)

## UMAP

In [None]:
from umap import UMAP

# UMAP was tried for dimension reduction. Because of computational constraints only
# 5 neighbors were used; a lot of time went into tuning it and the picture never improved.
umap = UMAP(n_neighbors=5)
um_5 = umap.fit_transform(x)

# Color every embedded track by the cluster the K = 8 k-means model assigned to it.
labels = kmeans_per_k[6].labels_.astype(int)
plt.scatter(x=um_5[:, 0], y=um_5[:, 1], c=labels)
plt.xlabel('Dim 1')
plt.ylabel('Dim 2')

## Visualizations and Explanations

In [None]:
num_vars = ['popularity', 'duration_ms', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
# Min-max scale the numeric training features so their means can share one axis.
# The scaler returns a plain array by default, so a DataFrame built from it would get
# a fresh 0..n-1 index. Reuse xtrain's index instead: pandas aligns an assigned Series
# by index label, and with a fresh index the ytrain_sup genres would land on the wrong
# rows (or become NaN and be dropped below).
xvis = pd.DataFrame(
    MinMaxScaler().fit_transform(xtrain.loc[:, num_vars]),
    columns=num_vars,
    index=xtrain.index,
)
# Series from the same split as xtrain: aligned on the shared index labels.
xvis['genre'] = ytrain_sup
# ytrain_clust comes from splitting the KMeans labels_ array, so it is assigned by
# position; it lines up with xtrain because both splits use the same random_state
# and test_size on the same x.
xvis['cluster'] = ytrain_clust
xvis = xvis.dropna()

In [None]:
gen = ['entertainment', 'latin', 'folk', 'electronic', 'foreign',
       'easy_listening', 'pop', 'rock']

# Mean of every scaled numeric feature within each super genre ...
summaries_gen = {
    name: xvis.loc[xvis['genre'] == name, num_vars].mean()
    for name in gen
}

# ... and within each of the 8 k-means clusters.
summaries_clust = {
    cluster_id: xvis.loc[xvis['cluster'] == cluster_id, num_vars].mean()
    for cluster_id in range(8)
}

In [None]:
import os

# plt.savefig does not create missing folders, so make sure the output directory
# exists before any figure is written.
os.makedirs('figures/attributes', exist_ok=True)

# One bar chart of mean scaled attributes per super genre.
for g in gen:
    temp = pd.DataFrame(summaries_gen[g])
    temp.plot.bar(title=f'Super Genre "{g}" Attributes', legend=False)
    plt.savefig(f'figures/attributes/genre_{g}.png', bbox_inches = "tight")

# One bar chart of mean scaled attributes per k-means cluster.
for i in range(8):
    temp = pd.DataFrame(summaries_clust[i])
    temp.plot.bar(title=f'Cluster "{i}" Attributes', legend=False)
    plt.savefig(f'figures/attributes/cluster_{i}.png', bbox_inches = "tight")