# **Music Recommendation**
## Artificial Intelligence Course


# **Import Libraries**

In [22]:
#Main
import os
import numpy as np
import pandas as pd

#Visualization
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

#SkLearn
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

# **Read Data**

In [23]:
data = pd.read_csv("data.csv")

# **EDA**

# **Characteristics of Different Genres**

Groupby and rank by popularity

In [24]:
genre_data = data.groupby(['genre']).mean()

In [25]:
genre_data

Unnamed: 0_level_0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A Capella,9.302521,0.829941,0.412252,204467.697479,0.250313,0.007491,0.136924,-13.660387,0.042414,111.51895,0.328724
Alternative,50.21343,0.162313,0.541898,233241.364245,0.711519,0.061303,0.196985,-6.540803,0.088783,122.534485,0.44959
Anime,24.258729,0.286843,0.47209,229937.067927,0.665356,0.280592,0.192391,-7.917802,0.065102,126.629156,0.441682
Blues,34.742879,0.32784,0.528232,251931.826,0.606171,0.095175,0.233125,-9.053807,0.061809,121.137637,0.579425
Children's Music,4.252637,0.591781,0.697475,142726.7742,0.39688,0.143572,0.164894,-11.64176,0.117279,121.131288,0.675956
Children’s Music,54.65904,0.163175,0.541843,232554.443601,0.706616,0.05434,0.195015,-6.526668,0.086489,121.931104,0.449237
Classical,29.282195,0.868843,0.305958,310339.983578,0.177984,0.599425,0.16281,-21.544477,0.052001,104.341807,0.214463
Comedy,21.34263,0.793098,0.559038,235305.210102,0.676094,0.000574,0.724775,-11.689321,0.853532,98.235488,0.412764
Country,46.100416,0.270172,0.577038,217237.527008,0.636318,0.00561,0.187216,-7.341693,0.048989,123.414419,0.53516
Dance,57.275256,0.152888,0.638191,226264.941846,0.698067,0.035449,0.187753,-6.054241,0.083608,120.795919,0.517754


In [26]:
top10_genres = genre_data.nlargest(10, 'popularity')
top10_genres
# Add bar plot genre vs popularity
# Add bar plot top 10 genres and 4 chars (popularity, acoust, dance, energy)

Unnamed: 0_level_0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Pop,66.590667,0.224819,0.640236,220859.160239,0.642208,0.016599,0.179967,-6.495423,0.107963,121.175844,0.481371
Rap,60.533795,0.16808,0.697244,219853.858102,0.65052,0.009317,0.198939,-6.669916,0.188186,121.100808,0.455918
Rock,59.619392,0.196429,0.538292,237003.22692,0.68367,0.053288,0.186981,-7.285875,0.053664,122.62963,0.517113
Hip-Hop,58.423131,0.176172,0.718808,219981.913179,0.643275,0.0112,0.201146,-6.860286,0.205396,120.791039,0.473381
Dance,57.275256,0.152888,0.638191,226264.941846,0.698067,0.035449,0.187753,-6.054241,0.083608,120.795919,0.517754
Indie,54.701561,0.331214,0.566821,224150.373991,0.581002,0.085317,0.168919,-7.915142,0.066724,119.290814,0.428665
Children’s Music,54.65904,0.163175,0.541843,232554.443601,0.706616,0.05434,0.195015,-6.526668,0.086489,121.931104,0.449237
R&B,52.308719,0.288216,0.642125,225748.130227,0.564248,0.025558,0.17535,-7.597064,0.120994,116.373834,0.450346
Alternative,50.21343,0.162313,0.541898,233241.364245,0.711519,0.061303,0.196985,-6.540803,0.088783,122.534485,0.44959
Folk,49.940209,0.463201,0.527276,235805.456608,0.491733,0.084934,0.170773,-9.870282,0.045077,118.748882,0.440237


# **Clustering Genres with K-Means**

Here, the simple K-means clustering algorithm is used to divide the genres in this dataset into ten clusters based on the numerical audio features of each genres.

In [27]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

In [28]:
scaled_features = MinMaxScaler().fit_transform([
  data['acousticness'].values,
  data['danceability'].values,
  data['duration_ms'].values,
  data['energy'].values,
  data['instrumentalness'].values,
  data['liveness'].values,
  data['loudness'].values,
  data['speechiness'].values,
  data['tempo'].values,
  data['valence'].values,
  ])

In [29]:
data[['acousticness','danceability','duration_ms','energy','instrumentalness','liveness','loudness','speechiness','tempo','valence']] = scaled_features.T

In [30]:
genre_n = pd.get_dummies(data['genre'])
key_n = pd.get_dummies(data['key'])

In [31]:
data = data.drop('genre',axis = 1)
data = data.drop('artist_name', axis = 1)
data = data.drop('track_name', axis = 1)
data = data.drop('popularity',axis = 1)
data = data.drop('key', axis = 1)
data = data.drop('mode', axis = 1)
data = data.drop('time_signature', axis = 1)

In [32]:
data = data.join(genre_n)
data = data.join(key_n)

In [33]:
data

Unnamed: 0,track_id,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,B,C,C#,D,D#,E,F,F#,G,G#
0,0BRjO6ga9RKCKjfDqeFgWV,0.000025,0.000022,1.0,0.000028,0.000018,0.000022,0.0,0.000019,0.001699,...,0,0,1,0,0,0,0,0,0,0
1,0BjC1NfoEOOusryehmNudP,0.000042,0.000045,1.0,0.000046,0.000040,0.000042,0.0,0.000041,0.001307,...,0,0,0,0,0,0,0,1,0,0
2,0CoSDzoNIKCRs124s9uTVy,0.000087,0.000085,1.0,0.000082,0.000082,0.000082,0.0,0.000082,0.000666,...,0,1,0,0,0,0,0,0,0,0
3,0Gc6TVm52BwZD07Ki6tIvf,0.000084,0.000081,1.0,0.000082,0.000080,0.000081,0.0,0.000080,0.001207,...,0,0,1,0,0,0,0,0,0,0
4,0IuslXpMROHdEPvSl1fTQK,0.000267,0.000260,1.0,0.000259,0.000257,0.000258,0.0,0.000256,0.001957,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232720,2XGLdVl7lGeq8ksM6Al7jT,0.000033,0.000035,1.0,0.000035,0.000034,0.000033,0.0,0.000033,0.000387,...,0,0,0,1,0,0,0,0,0,0
232721,1qWZdkBl4UVPj9lK6HuuFM,0.000025,0.000027,1.0,0.000027,0.000025,0.000025,0.0,0.000025,0.000428,...,0,0,0,0,0,1,0,0,0,0
232722,2ziWXUmQLrXTiYjCg2fZ2t,0.000055,0.000053,1.0,0.000052,0.000050,0.000050,0.0,0.000050,0.000554,...,0,0,0,1,0,0,0,0,0,0
232723,6EFsue2YbIG4Qkq8Zr9Rir,0.000033,0.000035,1.0,0.000035,0.000032,0.000034,0.0,0.000033,0.000482,...,0,0,0,0,0,0,0,0,0,0


# **Build Recommender System**

* Based on the analysis and visualizations, it’s clear that similar genres tend to have data points that are located close to each other while similar types of songs are also clustered together.
* This observation makes perfect sense. Similar genres will sound similar and will come from similar time periods while the same can be said for songs within those genres. We can use this idea to build a recommendation system by taking the data points of the songs a user has listened to and recommending songs corresponding to nearby data points.
* [Spotipy](https://spotipy.readthedocs.io/en/2.16.1/) is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs. You have to install using `pip install spotipy`
* After installing Spotipy, you will need to create an app on the [Spotify Developer’s page](https://developer.spotify.com/) and save your Client ID and secret key.

In [14]:
!pip install spotipy

Collecting spotipy
  Downloading spotipy-2.19.0-py3-none-any.whl (27 kB)
Installing collected packages: spotipy
Successfully installed spotipy-2.19.0


In [17]:
client_id = "1659166c086249a09f1dedffe685d4a3"
client_secret= "d4aa95e596574ac3aa0b6de9270d2990"

In [35]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

In [39]:
#Fetching the playlist
scope = 'user-library-read'
token = util.prompt_for_user_token(
    scope, 
    client_id= client_id, 
    client_secret=client_secret, 
    redirect_uri='http://localhost:8881/callback'
  )
sp = spotipy.Spotify(auth=token)

In [40]:
#Fetching the playlist
def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)

In [43]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

number_cols = ['valence', 'acousticness', 'danceability', 'duration_ms', 'energy','instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo']


def get_song_data(song, spotify_data):
    
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name'])].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['name'])
        

def get_mean_vector(song_list, spotify_data):
    
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)  
    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict


def recommend_songs( song_list, spotify_data, n_songs=10):
    
    metadata_cols = ['name', 'artists']
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [44]:
recommend_songs([{'name': 'Come As You Are'},
                {'name': 'Smells Like Teen Spirit'},
                {'name': 'Lithium'},
                {'name': 'All Apologies'},
                {'name': 'Stay Away'}],  data)

KeyError: 'name'

* This last cell will gives you a recommendation list of songs like this,


```
[{'name': 'Life is a Highway - From "Cars"',
  'year': 2009,
  'artists': "['Rascal Flatts']"},
 {'name': 'Of Wolf And Man', 'year': 1991, 'artists': "['Metallica']"},
 {'name': 'Somebody Like You', 'year': 2002, 'artists': "['Keith Urban']"},
 {'name': 'Kayleigh', 'year': 1992, 'artists': "['Marillion']"},
 {'name': 'Little Secrets', 'year': 2009, 'artists': "['Passion Pit']"},
 {'name': 'No Excuses', 'year': 1994, 'artists': "['Alice In Chains']"},
 {'name': 'Corazón Mágico', 'year': 1995, 'artists': "['Los Fugitivos']"},
 {'name': 'If Today Was Your Last Day',
  'year': 2008,
  'artists': "['Nickelback']"},
 {'name': "Let's Get Rocked", 'year': 1992, 'artists': "['Def Leppard']"},
 {'name': "Breakfast At Tiffany's",
  'year': 1995,
  'artists': "['Deep Blue Something']"}]
```



* You can change the given songs list as per your choice.