In [None]:
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import IFrame
from matplotlib import pyplot
from sklearn import datasets # sklearn comes with some toy datasets to practise
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
import config
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials

In [None]:
# Initialize SpotiPy with the client credentials read from the `config` module
auth_manager = SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

## importing song database

In [None]:
# Song library. Later cells use its numeric columns as clustering features
# and its `id` column to pick the recommended track.
df = pd.read_csv("library.csv")
df

# functions - scaling & clustering

In [None]:
def standard_scaling_transform(DataFrame: pd.DataFrame) -> pd.DataFrame:
    """Standard-scale the numeric columns of a DataFrame with the fitted global ``scaler``.

    Args:
        DataFrame: frame whose numeric columns should be scaled. It is left
            untouched; the scaling is applied to a copy.

    Returns:
        A new DataFrame in which every numeric column has been replaced by
        ``scaler.transform`` of it; non-numeric columns are carried over as-is.
    """
    # Work on a copy so the caller's frame is not silently overwritten.
    scaled = DataFrame.copy()
    numeric_columns = scaled.select_dtypes(include=np.number).columns
    scaled[numeric_columns] = scaler.transform(scaled[numeric_columns])
    return scaled

In [None]:
def cluster_predict(DataFrame: pd.DataFrame) -> np.ndarray:
    """Predict a cluster for every row, using only the numeric columns.

    The numeric columns of ``DataFrame`` are passed to the global ``kmeans``
    model and its predictions are returned unchanged.
    """
    numeric_columns = DataFrame.select_dtypes(include=np.number).columns
    numeric_features = DataFrame[numeric_columns]
    return kmeans.predict(numeric_features)

## scaling

In [None]:
# Feature columns = every numeric column except the `cluster` label that a
# later cell adds. Excluding it keeps this cell safe to re-run: otherwise the
# scaler would be fitted on the labels too and could no longer transform the
# features of a new song.
numeric_columns = df.select_dtypes(include=np.number).columns.drop("cluster", errors="ignore")

# Fit the scaler on the library and overwrite the feature columns of `df`
# with their standardised values (in place).
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# NOTE: `scaled_df` is a second name for the same object as `df`, not a copy.
scaled_df = df

## finding the best number of clusters (K)

## Elbow Method

In [None]:
K = range(2, 21)
inertia = []

for k in K:
    print("Training a K-Means model with {} clusters! ".format(k))
    print()
    kmeans3 = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans3.fit(df[df.select_dtypes(include=np.number).columns])
    inertia.append(kmeans3.inertia_)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

## Silhouette

In [None]:
# Silhouette method: fit one K-Means model per candidate k, save it to disk
# and record the silhouette score of its assignment, then plot score against k.
K = range(2, 20)
silhouette = []

# The feature matrix is identical for every k, so build it once instead of
# re-selecting the columns three times per iteration. The `cluster` label
# column (added by a later cell) is excluded so that re-running this cell
# does not treat the labels as a feature.
silhouette_columns = df.select_dtypes(include=np.number).columns.drop("cluster", errors="ignore")
silhouette_features = df[silhouette_columns]

for k in K:
    kmeans4 = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans4.fit(silhouette_features)

    # Persist each fitted model as kmeans_<k>.pickle in the working directory.
    filename = "kmeans_" + str(k) + ".pickle"
    with open(filename, "wb") as f:
        pickle.dump(kmeans4,f)

    silhouette.append(silhouette_score(silhouette_features, kmeans4.predict(silhouette_features)))


plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
# Trailing semicolon hides the Text(...) repr of the last expression.
plt.title('Silhouette Method showing the optimal k');

#### adjust the number of clusters in n_clusters

In [None]:
# Final model used by cluster_predict(). n_clusters is the value to tune
# (13 here, presumably read off the elbow/silhouette plots; TODO confirm).
kmeans = KMeans(n_clusters=13, random_state=1234)

## fit and predict clusters

In [None]:
# Fit the final model on the numeric columns of the scaled library.
# NOTE: scaled_df is the same object as df, so any numeric column added to df
# (such as the `cluster` labels) would be picked up as a feature here.
kmeans.fit(scaled_df[scaled_df.select_dtypes(include=np.number).columns])

In [None]:
# Cluster assignment for every song in the library (one label per row).
labels = cluster_predict(scaled_df)

In [None]:
# Distinct cluster ids that were actually assigned.
np.unique(labels)

In [None]:
# Clusters overview: number of songs per cluster, ordered by cluster id.
pd.Series(labels).value_counts().sort_index()

## adding cluster column to the database DataFrame

In [None]:
# Attach the labels to a NEW frame and rebind `df` to it. `scaled_df` keeps
# pointing at the label-free frame, so re-running the fit/predict cells does
# not pick up `cluster` as an extra numeric feature.
df = df.assign(cluster=labels)
pd.options.display.max_columns = 5 # reducing number of visible columns to see cluster column right away
df

In [None]:
# Undo the temporary column limit set when displaying the cluster column.
# 20 is assumed to be the value in effect before that cell — TODO confirm.
pd.options.display.max_columns = 20 

## importing Top100 list

In [None]:
# Import the Top100 DataFrame; check_top_100() reads its `title` and `artist` columns.
df100 = pd.read_csv('top100.csv')
df100

## filtering if user_input is in Top100 or not

In [None]:
def check_top_100(df100: pd.DataFrame) -> list:
    """Ask the user for a song and decide whether it is in the Top100.

    Prompts for a song title and, if that title occurs in ``df100``, for the
    artist. Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        df100: frame with the string columns ``title`` and ``artist``.
            It is only read, never modified.

    Returns:
        A two-element list ``[song_title, flag]``:

        * ``[<a different Top100 title>, 1]`` when the entered title and artist
          match a Top100 entry;
        * ``[<entered title>, 0]`` when the title is unknown, the artist does
          not match, or the list holds no other title to recommend.

        Titles are returned in ``str.title()`` case.
    """
    title = input('enter a song title:\n ').strip()
    wanted = title.lower()

    # Compare against a lowercased copy of the column; the caller's frame
    # keeps its original spelling.
    lower_titles = df100['title'].str.lower()

    # not in top100
    if not (lower_titles == wanted).any():
        print(f'Your choice is not in the Top100.\nYou will now get a recommendation of a song based on "{title.title()}".')
        return [title.title(), 0]

    # maybe in top100: the title exists, the artist decides
    lower_artists = df100.loc[lower_titles == wanted, 'artist'].str.lower().tolist()
    artist = input('\nYour song might be in the Top100. Please state which artist you are referring to: \n').strip()

    # wrong artist, combination not in top100
    if artist.lower() not in lower_artists:
        print(f'\nYour combination of song title "{title.title()}" and artist "{artist.title()}" is not in the Top 100.\nYou will now get a recommendation based on "{title.title()}".')
        return [title.title(), 0]

    # in top100: pick among the OTHER titles up front, so the same song is
    # never recommended and no retry loop is needed
    other_songs = df100.loc[lower_titles != wanted, 'title'].tolist()
    if not other_songs:
        print(f'\nYour Song is in the Top100 :), but there is no other Top100 song to recommend.\nYou will now get a recommendation based on "{title.title()}".')
        return [title.title(), 0]

    print('\nYour Song is in the Top100 :).\nYou will now get a recommendation of another song out of the Top100.')
    return [random.choice(other_songs).title(), 1]
                    

## finding the track_id from user input

In [None]:
def find_track_id(song_title: list) -> str:
    """Return the Spotify id of the first search hit for a song.

    Args:
        song_title: list whose first element is the text to search for
            (``song_recommender`` passes the list returned by ``check_top_100``).

    Returns:
        The ``id`` of the first track in the search results.

    Raises:
        IndexError: if the search returns no tracks.
    """
    query = song_title[0]
    search_results = sp.search(q=query,limit=3)
    items = search_results["tracks"]["items"]
    if not items:
        # Same exception type as indexing the empty list would raise, but the
        # message names the query that found nothing.
        raise IndexError(f'Spotify search returned no tracks for "{query}"')
    return items[0]["id"]

## fetching audio features from a track_id and creating a dataframe

In [None]:
def audio_features(track_id: str) -> list:
    """Fetch the audio features for ``track_id`` from the global Spotify client ``sp``."""
    features = sp.audio_features(track_id)
    return features

In [None]:
def features_df(audio_features: list) -> pd.DataFrame:
    """Wrap the given audio-features list in a pandas DataFrame."""
    frame = pd.DataFrame(audio_features)
    return frame

## recommending a random track_id from the same cluster as the user_input cluster

In [None]:
def random_rec(library_df: pd.DataFrame ,cluster_number: np.ndarray) -> str:
    """Pick a random track id from the library rows of one cluster.

    The cluster is the first element of ``cluster_number``; candidates are the
    ``id`` values of all rows whose ``cluster`` column equals it.
    """
    wanted_cluster = cluster_number[0]
    in_cluster = library_df['cluster'] == wanted_cluster
    candidate_ids = library_df.loc[in_cluster, 'id'].tolist()
    return random.choice(candidate_ids)

In [None]:
def play_song(track_id: str):
    """Return a displayable IFrame that embeds the Spotify player for ``track_id``."""
    embed_url = "https://open.spotify.com/embed/track/" + track_id
    # Extra keyword arguments handed to IFrame alongside src/width/height.
    extras = {
        "frameborder": "0",
        "allowtransparency": "true",
        "allow": "encrypted-media",
    }
    return IFrame(src=embed_url, width="320", height="80", **extras)

## finished song recommender

In [None]:
def song_recommender():
    """Ask for a song and display a player with a recommendation.

    If ``check_top_100`` flags the choice as a Top100 hit (flag != 0), the
    title it returned is looked up and played directly. Otherwise the entered
    song's audio features are scaled, assigned to a cluster, and a random
    library track from that cluster is played.
    """
    song_choice = check_top_100(df100)

    # Top100 hit: song_choice already holds the recommended title.
    if song_choice[1] != 0:
        display(play_song(find_track_id(song_choice)))
        return

    # Not in the Top100: recommend by cluster of the entered song.
    track_ID = find_track_id(song_choice)
    raw_features = features_df(audio_features(track_ID))
    scaled_features = standard_scaling_transform(raw_features)
    user_song_cluster = cluster_predict(scaled_features)
    recommended_id = random_rec(df, user_song_cluster)
    display(play_song(recommended_id))

In [None]:
# This changes a global pandas display option; it stays at 100 until a later
# cell sets it again.
pd.options.display.max_rows = 100 # increasing visible rows so you can pick a song from the Top100 if you want to
df100



## Have fun :)

In [None]:
# Interactive: prompts for a song title (and possibly an artist) via input().
song_recommender()

In [None]:
# Restore pandas' own default for visible rows (60) instead of hard-coding a
# value; the previous literal 20 did not match that default.
pd.reset_option("display.max_rows")