In [1]:
import numpy as np
import pandas as pd
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.cluster import KMeans
import random
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score


In [2]:
def get_track_uri_tuples(df): 
    """
    input:
    df- dataframe of playlist from 'extract' tracks

    output- removes duplicates of tracks and then returns all track uris of that df
    
    """
    df_final = df.drop_duplicates(['track_uri'])
    track_uri_tuple = [(x,y,z,a) for x,y,z,a in zip(df_final['artist_name'], df_final['track_name'], df_final['track_uri'], df_final['artist_uri'])]
    return track_uri_tuple

def get_track_uri(track_uri_tuple):
    """
    input:
    track_uri_tuple - list of track_uris_tuples (artist name, song, track_uris)

    output- just track_uris in same order for spotipy
    
    """

    track_uris = [x[2] for x in track_uri_tuple]
    return track_uris


#track_uris = get_track_uri(track_uri_tuple)

In [3]:
client_credentials_manager = SpotifyClientCredentials(client_id='911b425bc8c3496297b0a66d3df0b236', client_secret='REMOVED FOR PRIVACY')
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [4]:
def get_track_uri(track_uri_tuple):
    """
    input:
    track_uri_tuple - list of track_uris_tuples (artist name, song, track_uris)

    output- just track_uris in same order for spotipy
    
    """

    track_uris = [x[2] for x in track_uri_tuple]
    return track_uris


def drop_string(df):
    """
    Drops the non-integer and non-float type columns for fitting to KNN model.
    Returns the formatted DataFrame.
    
    input: 
    df (DataFrame) - 
    
    output:
    Formatted dataframe of the playlist
    """

    #also drops unnamed which is redudancy from creating new dataframes from indices
    un_list = [x for x in list(df.columns) if "Unnamed" in x]
    df = df.drop(un_list, axis = 1)
    return df.select_dtypes(exclude=['string', 'object'])


def find_artist_uri(artist_name, song_name):
    """
    Takes the artist_name and finds its corresponding artist_uri
    
    input: 
    artist_name - name of desired artist
    
    output:
    artist_uri = uri of desired artist
    
    """
    result = sp.search(artist_name + " " + song_name)
    return result['tracks']['items'][0]['artists'][0]['uri']

def find_song_uri(song_name, artist_uri):
    """
    Takes the song_name and finds its corresponding song_uri
    
    input: 
    song_name - name of desired song
    artist_uri - URI of desired artist
    
    output:
    song_uri = uri of desired song
    
    """
    result = sp.search(song_name)
    for i in range(len(result['tracks']['items'])):

        poss_auri = result['tracks']['items'][i]['artists'][0]['uri']

        if artist_uri == poss_auri:
            print("Artist found through Spotipy search.")
            return result['tracks']['items'][i]['uri']
    
    print("Artist not found through Spotipy search.")
    return result['tracks']['items'][0]['uri']

def find_tuple(track_name, artist_name):
    """
    Creates a track URI tuple for input into the get_audio_features method.
    
    input: 
    song_name = name of desired song
    artist_name = name of desired artist
    
    output:
    track_uri_tuple = tuple of (artist_name, track_name, track_uri, artist_uri)
    
    """
    artist_uri = find_artist_uri(artist_name, track_name)
    track_uri = find_song_uri(track_name, artist_uri)

    return (artist_name, track_name, track_uri, artist_uri)


def process_song_input(track_uri_tuple, training_df):
    """
    Formats the track_uri_tuple such that it can be used as input. inthe Kmeans model

    """
    get_track_uri(track_uri_tuple)
    df_s = get_audio_features(track_uri_tuple) 
    df_s = add_pop_rd_exp(df_s) 
    df_s = add_genres(df_s,training_df)

    return df_s

def get_audio_features(track_uri_tuple):
    """
    input: track_uri_tuple
    output: a dataframe with all audio_features #is this for only one song?

    """
    track_uri = track_uri_tuple[2]
    audio_features_dict = {}

    af = sp.audio_features(tracks = [track_uri])
    audio_features_dict[track_uri_tuple] = af[0]
    xf = pd.DataFrame.from_dict(audio_features_dict, 'index')
    df_final = xf.reset_index().rename(columns={"level_0": "Artist Name", "level_1": "Song Name", "level_3": "aUri"}).drop('level_2', axis =1)

    return df_final

def add_pop_rd_exp(df):
    """
    input: df without pop,rd,exp
    output: df with pop, rd,exp
    """

    current_track = sp.tracks(tracks= df['uri'])['tracks'][0]

    rd_list = []
    pop_list = []
    exp_list = []

    if current_track is None:
        release_date = "No Release Date"
        pop = "No Popularity"
        exp = "No Information"
    else:
        exp = current_track['explicit']
        if current_track['album']['release_date'] is not None:
            release_date = current_track['album']['release_date']
        else:
            release_date = "No Release Date"
        if current_track['popularity'] is not None:
            pop = current_track['popularity']
        else:
            release_date = "No Popularity"
    rd_list+= [release_date]
    pop_list.append(pop)
    exp_list.append(exp)
    df['popularity'] = pop_list
    df['release_date'] = rd_list
    df['is_explicit'] = exp_list
    return df


def add_genres(df, training_df):
    """
    training_df = whatever df is being used to train for kMeans
    output: df with genres
    """
    #or whatever csv training is

    list_of_genres = list(training_df.columns[34:]) #indices might be off depending if unnamed was dropped or not

    #get genres from artist uri
    auri = df['aUri'][0]
    song_genres = sp.artist(artist_id = auri)['genres']
    df['genres'] = [song_genres]

    for genre_to_check in list_of_genres:
        bool_list = []

        if genre_to_check in song_genres:
            bool_list += [1]
        else:
            bool_list += [0]

        df[genre_to_check] = bool_list
 
    return df

In [5]:
def build_playlist_from_song(song_name, artist_name, df):
    
    song_df = df.copy()
    # get tuple
    track_uri_tuple = find_tuple(song_name, artist_name)
    # format in terms of the row in song_df --> use spotipy api / Ferdie's code
    song_input = process_song_input(track_uri_tuple, song_df)

    formatted_input = drop_string(song_input)

    formatted_df = drop_string(song_df)

    k = find_best_k(formatted_df)

    kmeans = KMeans(n_clusters = k, random_state = 42)
    kmeans.fit(formatted_df)

    #score = silhouette_score(formatted_df, kmeans.labels_)
    #print("Silhouette Score:", score)

    song_df['kmeans'] = kmeans.labels_

    cluster_of_choice = kmeans.predict(formatted_input)

    # Get dataframe with all songs from predicted cluster for input
    songs_in_cluster = song_df[song_df['kmeans'] == cluster_of_choice[0]].drop(columns= "kmeans")

    # Finds euclidean distance from input to other values
    new_df = drop_string(songs_in_cluster).drop(columns= "is_explicit") # using .head() for quick testing

    eu_distances = new_df.apply(lambda x: np.linalg.norm(x - formatted_input.drop(columns="is_explicit")), axis = 1)

    songs_in_cluster['eu_distances'] = eu_distances
    # Sort on euclidean distance, then take first n = 2 songs
    df_final = songs_in_cluster.sort_values('eu_distances')

    return df_final.iloc[:2, :][["Song Name", "Artist Name", 'eu_distances']]

from sklearn.cluster import KMeans

def find_best_k(df):

    ### Finding best K's
    max_diff = -999999
    best_k = dict()

    avg_sum_squares = []

    # calculation method
    for k in range(1, 11):
        kmeans = KMeans(n_clusters = k, random_state = 42)
        kmeans.fit(df)

        avg_sum_squares += [kmeans.inertia_]

        if k == 1:
            prev_inertia = kmeans.inertia_

        else:
            slope = prev_inertia - kmeans.inertia_
            prev_inertia = kmeans.inertia_

            max_diff = max(max_diff, slope)

            if max_diff <= slope:
                best_k[max_diff] = k

    #plt.scatter(range(1, 11), avg_sum_squares)
    #plt.title("Elbow Method for Optimal K")
    #print("Calculated Best K:", best_k[max_diff])

    k = best_k[max_diff]
    return k


In [6]:
# reload df

In [7]:
n = 92814 #number of records in file
s = 30000 #desired sample size

random.seed(42)

filename = "final_spotify_example.csv"
skip = sorted(random.sample(range(n),n-s + 1))
song_df = pd.read_csv(filename, skiprows=skip)

song_df

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1,Artist Name,...,big band,appalachian folk,blues,funky breaks,rock nacional,canzone napoletana,jazz blues,pagode baiano,indie folk argentino,cumbia ranchera
0,1,1,1,1,1,1,1,1,1,Frank Sinatra,...,0,0,0,0,0,0,0,0,0,0
1,7,7,7,7,7,7,7,7,7,Louis Armstrong,...,0,0,0,0,0,0,0,0,0,0
2,9,9,9,9,9,9,9,9,9,Nina Simone,...,0,0,0,0,0,0,1,0,0,0
3,12,12,12,12,12,12,12,12,12,The Marvelettes,...,0,0,0,0,0,0,0,0,0,0
4,17,17,17,17,17,17,17,17,17,James Brown & The Famous Flames,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,92807,92807,92807,92807,92807,92807,92807,92807,92807,Justin Bieber,...,0,0,0,0,0,0,0,0,0,0
29996,92810,92810,92810,92810,92810,92810,92810,92810,92810,Taylor Swift,...,0,0,0,0,0,0,0,0,0,0
29997,92812,92812,92812,92812,92812,92812,92812,92812,92812,Miley Cyrus,...,0,0,0,0,0,0,0,0,0,0
29998,92813,92813,92813,92813,92813,92813,92813,92813,92813,Why Don't We,...,0,0,0,0,0,0,0,0,0,0


In [None]:
playlist = build_playlist_from_song("More Than Survive", "Will Roland", song_df) #not sure why it didnt run
playlist

In [58]:
def build_playlist_from_songs(list_lists, df, num_songs):
    
    # Reset df
    song_df = df.copy()
    
    # Remember to normalize this time!
    formatted_df = drop_string(song_df)
    formatted_df = pd.DataFrame(columns=formatted_df.columns, data = preprocessing.normalize(formatted_df, norm='l2'))


    k = find_best_k(formatted_df)

    kmeans = KMeans(n_clusters = k, random_state = 42)
    kmeans.fit(formatted_df)
    
    score = silhouette_score(formatted_df, kmeans.labels_)
    print("Silhouette Score:", score)
    
    final_playlist = pd.DataFrame()
    for pair in list_lists:
        song_df = df.copy()
        
        # get tuple
        track_uri_tuple = find_tuple(pair[0], pair[1])
        print(track_uri_tuple)

        # format in terms of the row in song_df --> use spotipy api / Ferdie's code
        song_input = process_song_input(track_uri_tuple, song_df)

        formatted_input = drop_string(song_input)  
        formatted_input = pd.DataFrame(columns=formatted_input.columns, data = preprocessing.normalize(formatted_input, norm='l2'))


        
        extra_cols = set(formatted_df.columns).symmetric_difference(set(formatted_input.columns))
        if len(extra_cols) > 0:
            formatted_input.drop(columns = list(extra_cols), inplace = True)
        
        cluster_of_choice = kmeans.predict(formatted_input)
        
        song_df['kmeans'] = kmeans.labels_
        
        song_df = song_df[song_df['Artist Name'] != pair[1]]

        # Get dataframe with all songs from predicted cluster for input
        songs_in_cluster = song_df[song_df['kmeans'] == cluster_of_choice[0]].drop(columns= "kmeans")

        # Finds euclidean distance from input to other values
        new_df = drop_string(songs_in_cluster).drop(columns= "is_explicit", errors = "ignore") # using .head() for quick testing
        new_df = pd.DataFrame(columns=new_df.columns, data = preprocessing.normalize(new_df, norm='l2'))


        eu_distances = new_df.apply(lambda x: np.linalg.norm(x - formatted_input.drop(columns="is_explicit", errors = "ignore")), axis = 1)

        songs_in_cluster['eu_distances'] = eu_distances
        # Sort on euclidean distance, then take first n = 2 songs
        df_final = songs_in_cluster.sort_values('eu_distances')
                                     
        final_playlist = final_playlist.append(df_final.iloc[:num_songs, :][["Song Name", "Artist Name", 'eu_distances']])

    return final_playlist

In [None]:
### FINAL PROCESS

In [37]:
song_list = [["Dancing Queen", "ABBA"], ["Just for a Moment", "Olivia Rodrigo"], ["Burning Up", "BTS"]]

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [59]:
import time

start = time.time()

final_playlist = build_playlist_from_songs(song_list, song_df, 3)

end = time.time()
print("Total time: ", end - start)

final_playlist

Silhouette Score: 0.6026498036403066
Artist found through Spotipy search.
('ABBA', 'Dancing Queen', 'spotify:track:0GjEhVFGZW8afUYGChu3Rr', 'spotify:artist:0LcJLqbBmaGUft1e9Mm8HV')
Artist found through Spotipy search.
('Olivia Rodrigo', 'Just for a Moment', 'spotify:track:5Yc2A0YtK6PaXT8J1MQ0Rd', 'spotify:artist:1McMsnEElThX1knmY4oliG')
Artist found through Spotipy search.
('BTS', 'Burning Up', 'spotify:track:4z9gBZQjGS4QLb4LOvmeuA', 'spotify:artist:3Nrfpe0tUJi4K4DXYWgMUX')
Total time:  400.34415793418884


Unnamed: 0,Song Name,Artist Name,eu_distances
17877,Blue Bell Knoll,Cocteau Twins,7e-06
22421,By the Sea,Johnny Depp,9e-06
23779,Fuck Me For Free,Akinyele,9e-06
3461,Unfailing Love,Chris Tomlin,9e-06
20796,Finders Keepers,Gnucci,1e-05
23820,The Chalice Passed,Howard Shore,1e-05
3246,Darte un Beso,Prince Royce,8e-06
16662,Nowhere Fast,Josh Turner,1e-05
10751,Aunque Te Duela,Fidel Rueda,1.1e-05


In [47]:
from sklearn import preprocessing
### Step by Step Process of build_playlist_from_songs([["Love Story", "Taylor Swift"]], song_df, 2)
# get tuple
track_uri_tuple = find_tuple("Love Story", "Taylor Swift")
# format in terms of the row in song_df --> use spotipy api / Ferdie's code
song_input = process_song_input(track_uri_tuple, song_df)

formatted_input = drop_string(song_input)
formatted_input

Artist found through Spotipy search.


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,big band,appalachian folk,blues,funky breaks,rock nacional,canzone napoletana,jazz blues,pagode baiano,indie folk argentino,cumbia ranchera
0,0.618,0.741,2,-3.95,1,0.031,0.17,0,0.0822,0.296,...,0,0,0,0,0,0,0,0,0,0


In [48]:
from sklearn import preprocessing
formatted_input = pd.DataFrame(columns=formatted_input.columns, data = preprocessing.normalize(formatted_input, norm='l2'))
formatted_input

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,big band,appalachian folk,blues,funky breaks,rock nacional,canzone napoletana,jazz blues,pagode baiano,indie folk argentino,cumbia ranchera
0,3e-06,3e-06,9e-06,-1.7e-05,4e-06,1.317652e-07,7.225832e-07,0.0,3.493902e-07,1e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
dfo = song_df.copy()
formatted_df = drop_string(dfo)
formatted_df = pd.DataFrame(columns=formatted_df.columns, data = preprocessing.normalize(formatted_df, norm='l2'))
kmeans = KMeans(n_clusters = 2, random_state = 42)
kmeans.fit(preprocessing.normalize(formatted_df, norm='l2'))

KMeans(n_clusters=2, random_state=42)

In [50]:
formatted_input = formatted_input.drop(columns = list(set(formatted_df.columns).symmetric_difference(set(formatted_input.columns))), errors = "ignore")
kmeans.predict(formatted_input)

array([0], dtype=int32)

In [51]:
dfo = song_df.copy()

In [52]:
dfo['Cluster Labels'] = kmeans.labels_

In [53]:
songs_in_cluster = dfo[dfo['Cluster Labels'] == 0].drop(columns= "Cluster Labels", errors = "ignore")
songs_in_cluster['Song Name']

1          La Vie En Rose - Single Version
2                             Feeling Good
4                  I Got You (I Feel Good)
5         I Heard It Through The Grapevine
7                               Lean on Me
                       ...                
29995                     Overboard (Live)
29996    State Of Grace - Acoustic Version
29997              My Heart Beats For Love
29998                      I Depend On You
29999                          Turn It Off
Name: Song Name, Length: 24745, dtype: object

In [56]:
new_df = drop_string(songs_in_cluster) # using .head() for quick testing
new_df = pd.DataFrame(columns=new_df.columns, data = preprocessing.normalize(new_df, norm='l2'))
list(set(new_df.columns).symmetric_difference(set(formatted_input.columns)))

[]

In [57]:
eu_distances = new_df.apply(lambda x: np.linalg.norm(x - formatted_input), axis = 1)

songs_in_cluster['eu_distances'] = eu_distances
# Sort on euclidean distance, then take first n = 2 songs
df_final = songs_in_cluster.sort_values('eu_distances')

df_final.iloc[:2, :][["Song Name", "Artist Name", 'eu_distances']]

Unnamed: 0,Song Name,Artist Name,eu_distances
3578,Wide Awake (feat. Kenzie May),Vindata,8e-06
1403,Too Much To Ask,Niall Horan,8e-06


In [9]:
song_list = [["BRB", "Mahalia"], ["Waveform", "Pinegrove"], ["I'm Still Standing", "Elton John"]]

In [None]:
our_playlist = build_playlist_from_songs(song_list, song_df, 4)

Silhouette Score: 0.6026498036403066
Artist found through Spotipy search.
('ABBA', 'Dancing Queen', 'spotify:track:0GjEhVFGZW8afUYGChu3Rr', 'spotify:artist:0LcJLqbBmaGUft1e9Mm8HV')


In [None]:
our_playlist