# Automatic Playlist Generation - Spotify Use Case

In [46]:
import spotipy
import requests
import spotipy.util as util
import pprint, json
import pandas as pd
import numpy as np

In this notebook, we're gonna generate playlist and exports it to Spotify. In the first part, playlists are generated from MSD tracks whereas in the second part, playlists are created using users' saved tracks. 

Let's define necessary variables for authentication and feature list taken from Spotify.

In [86]:
CLIENT_ID = '25e3cc7f42ad41d3b0a394e0658ec33a'
CLIENT_SECRET = '5ab7fd87ffb8417d8ce83cbfde8283ad'
scope = 'user-library-read playlist-modify user-read-private'
username = 'ezgitek'
PLAYLIST_NO = 0
feature_list = ["danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "id"]
print("Size of feature list: ", str(len(feature_list)))

Size of feature list:  12


Let's implement functions necessary for getting data from spotify.

In [87]:
def get_token(username, scope, client_id, client_secret):
    return util.prompt_for_user_token(username, scope, client_id, client_secret, redirect_uri='http://localhost/')

def read_user_saved_tracks(sp):
    limit_of_track = 50
    offset_of_track = 0
    get_track = True
    audio_features = np.array(feature_list)
    while get_track:
        results = sp.current_user_saved_tracks(limit=limit_of_track, offset=offset_of_track)
        len_of_tracks = 0
        for item in results['items']:
            len_of_tracks += 1
            track = item['track']
            #print(track['id'] + " - " + track['name'])
            features_in_json = get_audio_features(sp, [track['id']])
            track_feature = [value for key, value in features_in_json[0].items()  if key in feature_list]
            audio_features = np.vstack([audio_features, track_feature])
        offset_of_track += len_of_tracks
        if len_of_tracks != 50:
            get_track = False
            print(offset_of_track)
    print("Total saved tracks " + str(offset_of_track))
    return audio_features

def get_audio_features(sp, track_id_list):
    return sp.audio_features(tracks=track_id_list)

def create_playlist_for_user(sp, username, playlist_name):
    return sp.user_playlist_create(username, playlist_name)["id"]

def add_tracks_to_playlist(sp, track_id_list, playlist_id):
    return sp.user_playlist_add_tracks(username, playlist_id, track_ids)

token = get_token(username, scope, CLIENT_ID, CLIENT_SECRET)
sp = spotipy.Spotify(auth=token)



            User authentication requires interaction with your
            web browser. Once you enter your credentials and
            give authorization, you will be redirected to
            a url.  Paste that url you were directed to to
            complete the authorization.

        
Opened https://accounts.spotify.com/authorize?client_id=25e3cc7f42ad41d3b0a394e0658ec33a&response_type=code&redirect_uri=http%3A%2F%2Flocalhost%2F&scope=playlist-modify+user-library-read+user-read-private in your browser


Enter the URL you were redirected to: http://localhost/?code=AQAFu_t8KgRqBJqH1OO_yZO7FPJO4Fw8deOmSGJ7ogmuGs2dgRx7ZBvrRiROYrO_UrE6VfElOMouhz6o2SbCDU05-Ao0VMOujKPFh0uemNNbenS_uCHn-Lbd5G2G7d-luEsqBwqz3cAMDQVQ-hc54ARVpnY2bypuYPT5t-Go0y1UpmCm626trzM6IUL5y3zM0PhlsIjbZF9AGpwxMOBW4Z9VTlDhpQmVev9mA8fgX7j3FZGfEjLoeqVjt5jUDw




Here we're receiving users' saved tracks

In [88]:
user_saved_tracks = read_user_saved_tracks(sp)
user_saved_tracks

KeyboardInterrupt: 

In [54]:
def weighted_norm(arr1, arr2, ind=-1, weight=10):
    if ind != -1:
        arr1[ind] *= weight
        arr2[ind] *= weight
    return np.linalg.norm(arr1-arr2)

## Playlist Generation using MSD tracks

Below, a KMeans model is trained using MSD data

In [67]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.model_selection import train_test_split
input_data_path = "features_msd_lda_sp.csv"
NO_OF_CLUSTER = 100
dataset = pd.read_csv(input_data_path)
dataset.drop(["Unnamed: 0"], 1, inplace=True)
string_features = ["track_id", "id", "artist_name", "title"]
datacopy = dataset.copy()
datacopy.drop(string_features, 1, inplace = True)
genres = dataset.genre.unique()
for index, genre in zip(range(0, len(genres)),genres):
    datacopy.loc[datacopy['genre'] == genre, 'genre'] = index 
for column in datacopy.columns:
    max_of_column = datacopy[column].max()
    datacopy[column] = datacopy[column].apply(lambda x: x / max_of_column)
def create_input_for_clustering(dataset):
    genres = dataset.genre.unique()
    for index, genre in zip(range(0, len(genres)),genres):
        dataset.loc[dataset['genre'] == genre, 'genre'] = index 
        #dataset.drop(string_features, 1, inplace = True)
    return dataset.as_matrix()
X = create_input_for_clustering(datacopy.copy())
kmeans = KMeans(n_clusters=319)
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
print("inertia:", inertia)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

inertia: 12023.8935989
Silhouette Coefficient: 0.125


Now, we're normalizing features of users' saved tracks. 

In [68]:
audio_features = user_saved_tracks
user_saved_tracks_df = pd.DataFrame(data = audio_features[1:, 0:], columns=audio_features[0,0:])
audio_features = audio_features[:, :-1]
audio_feature_df = pd.DataFrame(data = audio_features[1:, 0:], columns=audio_features[0,0:])
columns = list(dataset.columns.values)[-12:][:-1]
for column in columns:
    max_of_column = dataset[str(column)].max()
    if column.endswith('_y'):
        column = column[:-2]
    audio_feature_df[column] = pd.to_numeric(audio_feature_df[column], errors='coerce') / max_of_column
audio_features = audio_feature_df.as_matrix()

In [69]:
user_saved_tracks_df.head

<bound method NDFrame.head of     danceability energy key loudness mode speechiness acousticness  \
0          0.448  0.331   5   -9.003    1      0.0301         0.71   
1          0.385  0.485   7   -6.036    0      0.0522        0.423   
2           0.44  0.834  11   -7.391    0      0.0772        0.702   
3           0.28  0.368   2  -10.756    0      0.0354        0.761   
4          0.354  0.474   2   -6.295    1      0.0267         0.75   
5          0.285  0.491  10   -8.525    0      0.0331        0.839   
6          0.353  0.588   7   -7.805    0      0.0365        0.748   
7          0.745  0.466   4    -7.62    0      0.0864      0.00666   
8           0.42  0.929   7   -3.899    0       0.121     0.000353   
9            0.6  0.796   2   -4.481    1      0.0275     0.000109   
10         0.586  0.656   1   -5.694    0      0.0252       0.0969   
11         0.378  0.896   2   -4.066    1      0.0407     0.000452   
12          0.62  0.243   2  -13.545    1      0.0362       

Below, we're generating playlist from MSD tracks based on a seed song from users' saved tracks.

In [85]:
import random 
NUMBER_OF_PLAYLIST_GENERATED = 1
#random_track_indices = random.sample(range(audio_features.shape[0]), NUMBER_OF_PLAYLIST_GENERATED)
random_track_indices = [183]
test_input = audio_features.copy()[random_track_indices]
nones = np.zeros((test_input.shape[0], X.shape[1]-len(feature_list)+1))
test_input = np.append(nones, test_input, axis=1)

column_dict = {k: v for v, k in enumerate(list(dataset.drop(string_features, 1)))}
cluster_numbers = kmeans.predict(test_input)

for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = kmeans.labels_ == cluster_number
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    d = ([weighted_norm(X[ind], test_input[i]) for ind in indices])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(user_saved_tracks_df.iloc[random_track_ind]["id"])
    print ('======')
    print(dataset.iloc[indices]["title"] + " by " + dataset.iloc[indices]["artist_name"])
    track_ids = dataset.iloc[indices]["id"].values.tolist()
    sp.trace = False
    playlist_name = "automatic-generated-playlist-" + str(PLAYLIST_NO)
    PLAYLIST_NO += 1
    playlists = sp.user_playlist_create(username, playlist_name)
    playlist_id = playlists["id"]
    results = sp.user_playlist_add_tracks(username, playlist_id, track_ids)

6cW9ndWeLeU9WUA9RZ49Md
3652                        It's Only Money by Thin Lizzy
3653                        Me And The Boys by Thin Lizzy
4865                   Rhinestone Cowboy by Glen Campbell
541                       All Or Nothing by Milli Vanilli
4653                             Burundanga by Celia Cruz
4752                        Ooh Baby Baby by The Miracles
5695                   Living In A Box by Living In A Box
5276                          Hound Dog by Little Richard
3160    Roll On Down The Highway by Bachman-Turner Ove...
3157                Down Down by Bachman-Turner Overdrive
dtype: object


## Playlist Generation using User's saved tracks

Below, we're gonna train KMeans model on user's saved tracks and generate playlist from these tracks based on a seed track.

In [59]:
audio_features = user_saved_tracks
user_saved_tracks_df = pd.DataFrame(data = audio_features[1:, 0:], columns=audio_features[0,0:])
audio_features = audio_features[:, :-1]
audio_feature_df = pd.DataFrame(data = audio_features[1:, 0:], columns=audio_features[0,0:])
columns = list(dataset.columns.values)[-12:][:-1]
for column in columns:
    max_of_column = dataset[str(column)].max()
    if column.endswith('_y'):
        column = column[:-2]
    audio_feature_df[column] = pd.to_numeric(audio_feature_df[column], errors='coerce') / max_of_column
audio_features = audio_feature_df.as_matrix()
kmeans_user = KMeans(n_clusters=15)
kmeans_user = kmeans_user.fit(audio_features)
# Getting the cluster labels
labels = kmeans_user.predict(audio_features)

In [60]:
import random 
NUMBER_OF_PLAYLIST_GENERATED = 1
random_track_indices = random.sample(range(audio_features.shape[0]), NUMBER_OF_PLAYLIST_GENERATED)
test_input = audio_features.copy()[random_track_indices]
column_dict = {k: v for v, k in enumerate(list(feature_list))}
cluster_numbers = kmeans_user.predict(test_input)
for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = kmeans_user.labels_ == cluster_number
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    d = ([weighted_norm(audio_features[ind], test_input[i], column_dict["energy"]) for ind in indices])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(user_saved_tracks_df.iloc[random_track_ind]["id"])
    print ('======')
    track_ids = user_saved_tracks_df.iloc[indices]["id"].values.tolist()
    sp.trace = False
    print(track_ids)
    playlist_name = "automatic-generated-playlist-" + str(PLAYLIST_NO)
    PLAYLIST_NO += 1
    playlists = sp.user_playlist_create(username, playlist_name)
    playlist_id = playlists["id"]
    results = sp.user_playlist_add_tracks(username, playlist_id, track_ids)

1p79faae6M9lWpKyrKLNwD
['4dytZLx1iSu9tSDQUnJDHg', '2d7LPtieXdIYzf7yHPooWd', '3cBtANnJGopPaRMXCl3mV7', '5Qsunik4B1QMO6ZdxVroLJ', '1d8S23OZqfCXq5AT34ol0q', '06rR2oyiyLea7sCqRiI7em', '4BX2l7iMZ7OYSJDGwoODgh', '77dQzT15KNr5JTAynjSiMM', '2nlWKPk6RzV0n9a1OUeTCz', '4yRPGDmtIWZjhIPcEvPxsF']


In [61]:
import pickle
classifier = None
with open('classifier', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    classifier = pickle.load(f)

In [63]:
import random 
random_track_indices = random.sample(range(len(X)), 10)
NUMBER_OF_PLAYLIST_GENERATED = 1
#random_track_indices = random.sample(range(audio_features.shape[0]), NUMBER_OF_PLAYLIST_GENERATED)
random_track_indices = [208]
test_input = audio_feature_df.as_matrix()[random_track_indices]
nones = np.zeros((test_input.shape[0], X.shape[1]-len(feature_list)+1))
test_input = np.append(nones, test_input[:,1:], axis=1)
genres = classifier.predict(test_input)
genres = [x/np.max(genres) for x in genres]
audio_copy = audio_feature_df.copy()
for ind, genre in zip(random_track_indices, genres):
    audio_copy.iloc[ind][0] = genre
test_input = audio_copy.as_matrix()[random_track_indices]
cluster_numbers = kmeans_user.predict(test_input)
for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = kmeans_user.labels_ == cluster_number
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    d = ([weighted_norm(audio_copy.iloc[ind], test_input[i]) for ind in indices])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(user_saved_tracks_df.iloc[random_track_ind]["id"])
    print ('======')
    track_ids = user_saved_tracks_df.iloc[indices]["id"].values.tolist()
    sp.trace = False
    print(track_ids)
    playlist_name = "automatic-generated-playlist-" + str(PLAYLIST_NO)
    PLAYLIST_NO += 1
    playlists = sp.user_playlist_create(username, playlist_name)
    playlist_id = playlists["id"]
    results = sp.user_playlist_add_tracks(username, playlist_id, track_ids)

2lAGG4iF7XsuqsG059t27S
['3WFB5CrA5pVu4IDyWM8jHN', '1AhDOtG9vPSOmsWgNW0BEY', '76CEXcx0OHPbq2cyiwjEHk', '6ICZQc8fr18dK37RN6R2YL', '771OYKR5EyyxaWGaZtCiGx', '78kk9wiapWNaTcA9Kzvhb0', '6E94HFcoPa1lOV0JCaul6v', '0LBua0mRC37pQzTVVUk3lB', '6cruljKBtHzEoydZd2lB0S', '2LricmvjlAmMgMYnqyWHRu']
