# Automatic Playlist Generation - Spotify Use Case

In [107]:
import spotipy
import requests
import spotipy.util as util
import pprint, json
import pandas as pd
import numpy as np

In this notebook, we're gonna generate playlist and exports it to Spotify. In the first part, playlists are generated from MSD tracks whereas in the second part, playlists are created using users' saved tracks. 

Let's define necessary variables for authentication and feature list taken from Spotify.

In [108]:
CLIENT_ID = '25e3cc7f42ad41d3b0a394e0658ec33a'
CLIENT_SECRET = '5ab7fd87ffb8417d8ce83cbfde8283ad'
scope = 'user-library-read playlist-modify user-read-private'
username = '11131118133'
PLAYLIST_NO = 0
feature_list = ["danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "id"]
print("Size of feature list: ", str(len(feature_list)))

Size of feature list:  12


Let's implement functions necessary for getting data from spotify.

In [109]:
def get_token(username, scope, client_id, client_secret):
    return util.prompt_for_user_token(username, scope, client_id, client_secret, redirect_uri='http://localhost/')

def read_user_saved_tracks(sp):
    limit_of_track = 50
    offset_of_track = 0
    get_track = True
    audio_features = np.array(feature_list)
    while get_track:
        results = sp.current_user_saved_tracks(limit=limit_of_track, offset=offset_of_track)
        len_of_tracks = 0
        for item in results['items']:
            len_of_tracks += 1
            track = item['track']
            #print(track['id'] + " - " + track['name'])
            features_in_json = get_audio_features(sp, [track['id']])
            track_feature = [value for key, value in features_in_json[0].items()  if key in feature_list]
            audio_features = np.vstack([audio_features, track_feature])
        offset_of_track += len_of_tracks
        if len_of_tracks != 50:
            get_track = False
    print("Total saved tracks " + str(offset_of_track))
    return audio_features

def get_audio_features(sp, track_id_list):
    return sp.audio_features(tracks=track_id_list)

def create_playlist_for_user(sp, username, playlist_name):
    return sp.user_playlist_create(username, playlist_name)["id"]

def add_tracks_to_playlist(sp, track_id_list, playlist_id):
    return sp.user_playlist_add_tracks(username, playlist_id, track_ids)

token = get_token(username, scope, CLIENT_ID, CLIENT_SECRET)
sp = spotipy.Spotify(auth=token)

Here we're receiving users' saved tracks

In [110]:
user_saved_tracks = read_user_saved_tracks(sp)
user_saved_tracks

Total saved tracks 267


array([['danceability', 'energy', 'key', ..., 'valence', 'tempo', 'id'],
       ['0.448', '0.331', '5', ..., '0.549', '120.373',
        '05CS4vtvc5hbbofZyo3xhB'],
       ['0.385', '0.485', '7', ..., '0.228', '119.836',
        '6a9BbEiz6gO2EqLnrSAKwA'],
       ..., 
       ['0.663', '0.494', '2', ..., '0.531', '119.872',
        '6a0BcOBXGJ9gr5MZ189oRG'],
       ['0.582', '0.186', '7', ..., '0.612', '103.095',
        '6sOTwzUE73fU6OFNRtYDi3'],
       ['0.557', '0.266', '2', ..., '0.373', '144.811',
        '795qPRGb0VFJFVrt28MkQo']],
      dtype='<U32')

## Playlist Generation using MSD tracks

Below, a KMeans model is trained using MSD data

In [19]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.model_selection import train_test_split
input_data_path = "features_msd_lda_sp.csv"
NO_OF_CLUSTER = 100
dataset = pd.read_csv(input_data_path)
dataset.drop(["Unnamed: 0"], 1, inplace=True)
string_features = ["track_id", "id", "artist_name", "title"]
datacopy = dataset.copy()
datacopy.drop(string_features, 1, inplace = True)
genres = dataset.genre.unique()
for index, genre in zip(range(0, len(genres)),genres):
    datacopy.loc[datacopy['genre'] == genre, 'genre'] = index 
for column in datacopy.columns:
    max_of_column = datacopy[column].max()
    datacopy[column] = datacopy[column].apply(lambda x: x / max_of_column)
def create_input_for_clustering(dataset):
    genres = dataset.genre.unique()
    for index, genre in zip(range(0, len(genres)),genres):
        dataset.loc[dataset['genre'] == genre, 'genre'] = index 
        #dataset.drop(string_features, 1, inplace = True)
    return dataset.as_matrix()
X = create_input_for_clustering(datacopy.copy())
kmeans = KMeans(n_clusters=319)
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
print("inertia:", inertia)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

inertia: 12006.5968641
Silhouette Coefficient: 0.125


Now, we're normalizing features of users' saved tracks. 

In [81]:
audio_features = user_saved_tracks
user_saved_tracks_df = pd.DataFrame(data = audio_features[1:, 0:], columns=audio_features[0,0:])
audio_features = audio_features[:, :-1]
audio_feature_df = pd.DataFrame(data = audio_features[1:, 0:], columns=audio_features[0,0:])
columns = list(dataset.columns.values)[-12:][:-1]
for column in columns:
    max_of_column = dataset[str(column)].max()
    if column.endswith('_y'):
        column = column[:-2]
    audio_feature_df[column] = pd.to_numeric(audio_feature_df[column], errors='coerce') / max_of_column
audio_features = audio_feature_df.as_matrix()

In [111]:
user_saved_tracks_df.head

<bound method NDFrame.head of     danceability energy key loudness mode speechiness acousticness  \
0          0.448  0.331   5   -9.003    1      0.0301         0.71   
1          0.385  0.485   7   -6.036    0      0.0522        0.423   
2           0.44  0.834  11   -7.391    0      0.0772        0.702   
3           0.28  0.368   2  -10.756    0      0.0354        0.761   
4          0.354  0.474   2   -6.295    1      0.0267         0.75   
5          0.285  0.491  10   -8.525    0      0.0331        0.839   
6          0.353  0.588   7   -7.805    0      0.0365        0.748   
7          0.745  0.466   4    -7.62    0      0.0864      0.00666   
8           0.42  0.929   7   -3.899    0       0.121     0.000353   
9            0.6  0.796   2   -4.481    1      0.0275     0.000109   
10         0.586  0.656   1   -5.694    0      0.0252       0.0969   
11         0.378  0.896   2   -4.066    1      0.0407     0.000452   
12          0.62  0.243   2  -13.545    1      0.0362       

Below, we're generating playlist from MSD tracks based on a seed song from users' saved tracks.

In [92]:
import random 
NUMBER_OF_PLAYLIST_GENERATED = 1
#random_track_indices = random.sample(range(audio_features.shape[0]), NUMBER_OF_PLAYLIST_GENERATED)
random_track_indices = [0]
test_input = audio_features.copy()[random_track_indices]
nones = np.zeros((test_input.shape[0], X.shape[1]-len(feature_list)+1))
test_input = np.append(nones, test_input, axis=1)

cluster_numbers = kmeans.predict(test_input)
for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = kmeans.labels_ == cluster_number
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    d = ([np.linalg.norm(X[ind]-test_input[i]) for ind in indices])
    #print(dataset.loc[index]["title"] + " by " + dataset.loc[index]["artist_name"])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(user_saved_tracks_df.iloc[random_track_ind]["id"])
    print ('======')
    print(dataset.iloc[indices]["title"] + " by " + dataset.iloc[indices]["artist_name"])
    track_ids = dataset.iloc[indices]["id"].values.tolist()
    sp.trace = False
    playlist_name = "automatic-generated-playlist-" + str(PLAYLIST_NO)
    PLAYLIST_NO += 1
    playlists = sp.user_playlist_create(username, playlist_name)
    playlist_id = playlists["id"]
    results = sp.user_playlist_add_tracks(username, playlist_id, track_ids)

05CS4vtvc5hbbofZyo3xhB
3195    Midnight Moses by The Sensational Alex Harvey ...
1332               Straight Shootin' Woman by Steppenwolf
4067                       Excited by The Doobie Brothers
5280                       Ooh! My Soul by Little Richard
2265                              Alimony by Diamond Head
5334                             Stone In Love by Journey
954     Hold Back The Night by Graham Parker & The Rumour
5327               Lovin'_ Touchin'_ Squeezin' by Journey
5335                           Keep On Runnin' by Journey
2214               (Ain't That) Good News by The Supremes
dtype: object


## Playlist Generation using User's saved tracks

Below, we're gonna train KMeans model on user's saved tracks and generate playlist from these tracks based on a seed track.

In [103]:
audio_features = user_saved_tracks
user_saved_tracks_df = pd.DataFrame(data = audio_features[1:, 0:], columns=audio_features[0,0:])
audio_features = audio_features[:, :-1]
audio_feature_df = pd.DataFrame(data = audio_features[1:, 0:], columns=audio_features[0,0:])
columns = list(dataset.columns.values)[-12:][:-1]
for column in columns:
    max_of_column = dataset[str(column)].max()
    if column.endswith('_y'):
        column = column[:-2]
    audio_feature_df[column] = pd.to_numeric(audio_feature_df[column], errors='coerce') / max_of_column
audio_features = audio_feature_df.as_matrix()
kmeans = KMeans(n_clusters=15)
kmeans = kmeans.fit(audio_features)
# Getting the cluster labels
labels = kmeans.predict(audio_features)
import random 
NUMBER_OF_PLAYLIST_GENERATED = 1
#random_track_indices = random.sample(range(audio_features.shape[0]), NUMBER_OF_PLAYLIST_GENERATED)
random_track_indices = [254]
test_input = audio_features.copy()[random_track_indices]

cluster_numbers = kmeans.predict(test_input)
for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = kmeans.labels_ == cluster_number
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    d = ([np.linalg.norm(audio_features[ind]-test_input[i]) for ind in indices])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(user_saved_tracks_df.iloc[random_track_ind]["id"])
    print ('======')
    track_ids = user_saved_tracks_df.iloc[indices]["id"].values.tolist()
    sp.trace = False
    print(track_ids)
    playlist_name = "automatic-generated-playlist-" + str(PLAYLIST_NO)
    PLAYLIST_NO += 1
    playlists = sp.user_playlist_create(username, playlist_name)
    playlist_id = playlists["id"]
    results = sp.user_playlist_add_tracks(username, playlist_id, track_ids)

0K4qgS2SOYhANcdLL8pybq
['4TPDtuazf64oXjPBSsKwSi', '5n6RDaGFSN88oRWuGtYAIN', '7DTx1sy8roG04BMg2huGOL', '3m6KkYKdnbffMpGd9Pm9FP', '0o9MIPIGM8svRpfdQGTvUX', '7jasmsYWsjrsXTIDI5RApZ', '73jUiSSBWsnr6jGQFdqfYi', '4l0XGK3I5NyZH7h9xltqpG', '1KbWV3TG4Jv5tEbROpyq98', '3DpXWapy0LD3ZnLpy7gPgI']
