In [1]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

import spotipy
from spotipy.oauth2 import SpotifyOAuth

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.cluster import MeanShift

In [3]:
# Confidential info: the Spotify API credentials live outside the notebook in
# keys.txt. As parsed below, line 1 carries the client id and line 2 the client
# secret, each taken as the text after the last '=' on its line.
with open("keys.txt", 'r') as keys_file:
    # The context manager closes the file handle as soon as the lines are read.
    keys = keys_file.read().splitlines()

client_id = keys[0].split('=')[-1]
client_secret = keys[1].split('=')[-1]
user = 116885657
redirect_uri='http://localhost:8910/callback/'

In [4]:
# Client used for reading: its OAuth manager requests the "user-library-read" scope.
scope = "user-library-read"
auth_manager = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scope,
)
sp_read = spotipy.Spotify(auth_manager=auth_manager)

In [5]:
# Client used for writing: its OAuth manager requests the "playlist-modify-public" scope.
# `scope` and `auth_manager` are rebound here, so after this cell they describe
# the write client only.
# NOTE(review): apart from the scope this manager is built exactly like the
# read one — confirm the two do not overwrite each other's cached token.
scope = "playlist-modify-public"
auth_manager = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scope,
)
sp_write = spotipy.Spotify(auth_manager=auth_manager)

In [6]:
def get_playlist_tracks(username,playlist_id):
    """Return every item of a playlist, following pagination to the end.

    Args:
        username: passed through to ``sp_read.user_playlist_tracks``.
        playlist_id: passed through to ``sp_read.user_playlist_tracks``.

    Returns:
        The ``'items'`` list of the first page, extended in place with the
        ``'items'`` of each following page.
    """
    page = sp_read.user_playlist_tracks(username, playlist_id)
    collected = page['items']
    # A truthy 'next' entry means another page is available.
    while page['next']:
        page = sp_read.next(page)
        collected += page['items']
    return collected

In [7]:
# Download every item of the source playlist (owner id, playlist id).
playlist = get_playlist_tracks('116885657', '6xDTuhbV6qQD1MGd4a7BhQ')

# Keep only items that carry a track with an id. An item whose 'track' is None
# would raise on the subscript, and a missing id cannot be looked up later.
# NOTE(review): presumably this covers removed/unavailable and local tracks —
# confirm against the Spotify playlist-items response format.
track_ids = [
    item['track']['id']
    for item in playlist
    if item.get('track') and item['track'].get('id')
]

Couldn't read cache at: .cache


In [8]:
# Display how many track ids were collected from the playlist.
len(track_ids)

1094

In [None]:
# Fetch audio features in batches rather than issuing one request per track.
# NOTE(review): spotipy documents a maximum of 100 ids per audio_features
# call — confirm for the installed version before raising this value.
AUDIO_FEATURES_BATCH_SIZE = 100

data = []
for start in range(0, len(track_ids), AUDIO_FEATURES_BATCH_SIZE):
    batch_ids = track_ids[start:start + AUDIO_FEATURES_BATCH_SIZE]
    batch_features = sp_read.audio_features(batch_ids) or []
    # Drop None entries so every element of `data` is a feature dict and the
    # DataFrame built from it gets one well-formed row per element.
    data.extend(entry for entry in batch_features if entry is not None)

In [None]:
# Build the working frame from the collected `data` entries; the last
# expression displays its first rows.
df = pd.DataFrame(data)
df.head()

In [None]:
# Columns of df that are plotted as histograms and passed to the clustering
# models further down. Must stay a list: it is used as a column selector.
features = [
    'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo',
]

In [None]:
# One histogram per entry of `features`, laid out on a 3x3 grid.
fig, axs = plt.subplots(3,3, figsize=(16, 9), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5)

# Draw on the Axes objects returned by plt.subplots instead of re-addressing
# each grid position through plt.subplot and indexing enumerate tuples by hand.
for ax, label in zip(axs.flat, features):
    ax.hist(df[label])
    ax.set_title(label);

In [None]:
distortions = []
X = df[features]

# Determine the optimal K clusters with the Elbow method:
# for k = 1..9, record the mean Euclidean distance from each row of X to its
# nearest cluster centre.
# NOTE(review): X is clustered unscaled; if the feature ranges differ, the
# wider-ranged columns dominate the Euclidean distance — consider standardising.

for i in range(1,10):
    # Fit once per k; the constructor's random_state keeps the run repeatable.
    kmeans = KMeans(n_clusters=i, random_state=0).fit(X)
    nearest_centre_distances = cdist(X, kmeans.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre_distances.mean())

In [None]:
# Semi-arbitrarily choose 4

fig, axs = plt.subplots(figsize=(8,5))
plt.xlabel("N clusters")
plt.ylabel("Euclidean distance metric")
# distortions[j] was computed for k = j + 1 (the elbow loop runs over
# range(1, 10)), so pass the cluster counts explicitly; plotting against the
# list index would shift the x axis by one.
plt.plot(range(1, len(distortions) + 1), distortions);

In [None]:
# Final K-means model with 4 clusters, fitted once on the feature matrix X.
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
# df_clustered is a second name for df, not a copy: the label column below is
# added to df itself, which is where the later cells read 'y_kmeans' from.
df_clustered = df
df_clustered['y_kmeans'] = kmeans.labels_

In [None]:
"""
Evaluating the performance of unsupervised clustering algorithms

Elbow method
Gap statistic
ref: https://towardsdatascience.com/clustering-evaluation-strategies-98a4006fcfc

Silhouette_score

How to measure the goodness of fit of an unsupervised clustering model
https://stats.stackexchange.com/questions/21807/evaluation-measures-of-goodness-or-validity-of-clustering-without-having-truth
""";

In [None]:
def clusterDist(clusterColumn, data=None):
    """Draw a bar chart of the number of rows per cluster label.

    Args:
        clusterColumn: name of the column holding the cluster labels.
        data: frame to summarise; when omitted, the notebook-level ``df``
            is used, as before.

    Returns:
        The result of ``plt.show()``.
    """
    frame = df if data is None else data

    # size() counts rows per label without depending on any other column.
    counts = frame.groupby(clusterColumn).size()
    positions = np.arange(len(counts))

    fig, axs = plt.subplots(figsize=(8,5))
    plt.bar(positions, counts.values)
    # Label every bar with the cluster it belongs to: the group order is the
    # sorted label order, which for string labels ('0', '1', '10', '2', ...)
    # does not match the bar positions 0..N-1.
    plt.xticks(positions, counts.index)
    plt.xlabel('Clusters')
    plt.ylabel('N observations')
    plt.title('distribution of N obs. per cluster')
    return plt.show()

In [None]:
# Cluster-size bar chart for the K-means labels stored in the 'y_kmeans' column.
clusterDist('y_kmeans')

In [None]:
"""
t-SNE is used for 3D cluster visualization.
PCA could also be an option, but it seems to be less effective.
""";

In [None]:
# Project the feature matrix onto 3 dimensions for plotting. The seed makes
# the embedding, and every scatter plot built from it, repeatable across runs.
X_embedded = TSNE(n_components=3, random_state=0).fit_transform(X)
X_embedded.shape

In [None]:
# Store the three embedding coordinates on df under these column names so the
# scatter plots below can read them next to the cluster labels.
reducedDimensions = ['TSNE1', 'TSNE2', 'TSNE3']
df[reducedDimensions] = X_embedded

In [None]:
# 3D scatter of the embedding, coloured by the K-means label of each row.
fig = plt.figure(figsize=(24, 13.5))
ax = fig.add_subplot(111, projection='3d')
# x, y and z stay at notebook level: the affinity-propagation scatter cell
# further down reuses them.
x, y, z = df['TSNE1'], df['TSNE2'], df['TSNE3']
ax.scatter(x, y, z, c=df['y_kmeans'], s=40)
plt.show()

In [None]:
# Silhouette coefficient of the K-means labelling, computed on the feature matrix X.
kmeans_silhouette = metrics.silhouette_score(X, df['y_kmeans'])
print (f"K-means silhouette coefficient: {kmeans_silhouette}")

In [None]:
from sklearn.cluster import AffinityPropagation

In [None]:
# Affinity propagation on the same feature matrix; the settings are gathered
# in one dict so they can be read (and tuned) in a single place.
ap_settings = dict(
    random_state=5,
    verbose=True,
    max_iter=5000,
    convergence_iter=150,
)
APmodel = AffinityPropagation(**ap_settings)

APclustering = APmodel.fit(X)
# Keep the labels on df so clusterDist and the scatter plot can use them.
df['y_AP'] = APclustering.labels_

In [None]:
# The cluster count is reported as (largest label + 1).
# NOTE(review): that assumes labels run 0..k-1 — confirm for non-converged fits.
ap_labels = APclustering.labels_
print (f"Affinity propagation yields {np.max(ap_labels) + 1} clusters")
print (f"and silhouette coefficient: {metrics.silhouette_score(X, ap_labels)}")

In [None]:
# Cluster-size bar chart for the affinity-propagation labels in the 'y_AP' column.
clusterDist('y_AP')

In [None]:
# Same 3D scatter as for K-means, coloured by the affinity-propagation label.
# x, y and z are the embedding coordinates bound in the K-means scatter cell.
fig = plt.figure(figsize=(24, 13.5))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=df['y_AP'], s=40)
plt.show()

In [None]:
# Mean shift on the same feature matrix X; n_jobs=-1 is passed to the estimator.
meanShiftModel = MeanShift(n_jobs=-1)
# The value returned by fit() is kept under its own name for the cells below.
meanShiftClustering = meanShiftModel.fit(X)

In [None]:
# The cluster count is reported as (largest label + 1).
meanshift_labels = meanShiftClustering.labels_
n_meanshift_clusters = np.max(meanshift_labels) + 1
print (f"Mean shift yields {n_meanshift_clusters} clusters")
print (f"and silhouette coefficient: {metrics.silhouette_score(X, meanshift_labels)}")

In [None]:
# Store the mean shift labels as strings; the plotly scatter colours by this
# column and the cluster lookup below compares it against '0'.
df['y_meanShift'] = [str(label) for label in meanShiftClustering.labels_]

In [None]:
# Cluster-size bar chart for the mean shift labels in the 'y_meanShift' column.
clusterDist('y_meanShift')

In [None]:
# Interactive 3D scatter of the embedding, coloured by mean shift label.
plot_columns = reducedDimensions + ['y_meanShift'] + ['id']
fig = px.scatter_3d(
    df[plot_columns],
    x='TSNE1',
    y='TSNE2',
    z='TSNE3',
    color='y_meanShift',
)

# Small markers with a dark outline, applied to the marker-mode traces only.
marker_style = dict(size=3, line=dict(width=2, color='DarkSlateGrey'))
fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

fig.show()

In [None]:
# Collect the ids of all tracks assigned to mean shift cluster 0
# (labels are stored as strings, hence the comparison with '0').
in_cluster_zero = df['y_meanShift'] == '0'
clusterZeroTrackList = df.loc[in_cluster_zero, 'id'].tolist()
# Show sample
clusterZeroTrackList[:5]

In [None]:
# Create playlist
#sp_write.user_playlist_create(user, "test")

In [None]:
#sp_write.playlist_add_items('2ZuE2v5gIshSQGWH2x3xVo', items=clusterZeroTrackList[0:100])

In [None]:
# `batches` was not defined by any earlier cell, so this loop raised NameError
# on a fresh kernel. Derive it from the cluster-0 track list in slices of 100,
# the slice size used by the commented-out playlist_add_items call.
# NOTE(review): presumably 100 is the per-request limit for adding playlist
# items — confirm against the Spotify API before uploading.
batches = math.ceil(len(clusterZeroTrackList) / 100)
for i in range(batches):
    print (i)