# Song Recommender

## Goal

#### The goal of this script is to provide song recommendations to a user based on their input making use of a machine learning algorithm
#### The algorithm generates the recomendation based on one condition:
* if the inputed song is currently popular/trending, it suggests an also currently popular song. Otherwise it returns a  recommendation from a larger more generic pool of songs.
    * It implements an unsupervised learning algorithm that creates clusters of songs based on similarities in their audio features.
    * It then matches the inputed song with the most similar cluster in the data and recommends the closest element to the user.

## Data

#### The script gathers song data containing song names, artist names and several features of the audio track
* data is sourced through the Spotify API
* song choices are researched throughout the web and sourced from popular user generated and offical Spotify playlists:
    * popular charts and trending in social media (ex Billboard, Tiktok)
    * recently popular/emerging artists
    * top countries by music market share (ex Wikipedia article)
    * most popular music genres in current days
    * greatist hits of past decades and/or specific popular genres

In [1]:
#!pip install bs4

In [2]:
#!pip install spotipy

In [22]:
import warnings
warnings.filterwarnings('ignore')
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import classification_report
from sklearn.metrics import pairwise_distances_argmin_min

In [23]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="8b82ad952fa146cea44e222c46aae1e8",
                                                           client_secret="a2b6deb5dc3943a299811af9a65cc448"))

#### Helper functions - retrieve a dataframe containing song name, artist and audio features from a Spotify playlist URL

In [24]:
def get_df_from_playlist(playlist_id):
    
    results = sp.user_playlist_tracks("spotify", playlist_id)
    tracks = results['items']

    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    
    track_ids = [track['track']['uri'] for track in tracks]
    
    start = 0
    df = pd.DataFrame()

    for stop in range(0, len(track_ids)+100, 100):
        if start != stop:
            new_df = pd.json_normalize(sp.audio_features(track_ids[start:stop]))
            df = pd.concat([new_df, df])
            start = stop

    df = df.reset_index(drop=True)
    df.drop(['type', 'uri', 'track_href', 'analysis_url',], axis=1, inplace=True)
    
    main_artist_names = [track['track']['artists'][0]['name'] for track in tracks]
    song_names = [track['track']['name'] for track in tracks]
    df['main_artist'] = main_artist_names
    df['song_name'] = song_names

    return df

In [25]:
def get_df_from_multi_playlists(playlists):

    df = pd.DataFrame()

    for pl in playlists:

        df_ = get_df_from_playlist(pl)
        df = pd.concat([df,df_]).drop_duplicates().dropna().reset_index(drop=True)
        
        time.sleep(30)

    return df

## I. Sourcing song data

### Popular songs (charts, top countries)

In [16]:
#playlists featuring top charting songs

top_songs_playlists = [
    "6UeSakyzhiEt4NB3UAd6NQ", #official Billboard Hot 100 playlist
    "65LdqYCLcsV0lJoxpeQ6fW", #tiktok viral hits 2023 (286)

    "37i9dQZEVXbNG2KDcFcKOF", #Spotify top 50 Global
   "37i9dQZEVXbLp5XoPON0wI", #Spotify top 50 US
   "37i9dQZEVXbKqiTGXuCOsB", #Spotify top 50 Japan
   "37i9dQZEVXbMwmF30ppw50", #Spotify top 50 UK
   "37i9dQZEVXbK8BKKMArIyl", #Spotify top 50 Germany
    "37i9dQZEVXbIPWwFssbupI" #Spotify top 50 France
]

df_top_songs = get_df_from_multi_playlists(top_songs_playlists)
len(df_top_songs)

590

### Broader song picks (best ofs, decades, popular genres, emerging)

In [17]:
#playlist picks - greatest hits, new releases, genre, decade

picks_playlists = [
    "37i9dQZF1DX5Rc4HJf52tj", #new hip hop and r&b (100)
    "2fiMj3MyjrRCPS4hkUPt75", #beatport best new trap (100)
    "37i9dQZF1DX11otjJ7crqp", #new pop picks (106)
    "37i9dQZF1DWZryfp6NSvtz", #all new rock (50)
    "37i9dQZF1DX7Y7BqFok9IQ", #modern blues rock (150)
    "37i9dQZF1DWWrJKwf0q9nn", #hot new dance (83)
    "4ey270tjW7LT0S2scJJ5yI", #new house and bass 2023 (105)
    "37i9dQZF1DX44dZ4p5QLf4", #next wave soul (100)
    
    "5ABHKGoOzxkaa28ttQV9sE", #top 100 most streamed on spotify (all time)
    
    "37i9dQZF1DX1tz6EDao8it", #iconic soundtracks(82)
    "37i9dQZF1DXd9rSDyQguIk", #blues classics (70)
    "37i9dQZF1DWWvhKV4FBciw", #funk and soul classics (80)
    "37i9dQZF1DX04mASjTsvf0", #r&b classics (100)
    "4TJiZKy0vcEZa20ixnuQUh", #greatest hits electronic (47)
    "37i9dQZF1DWTU3Zl0elDUa", #90s house classics (60)
    "37i9dQZF1DX5qmTk3PVatJ", #greatest hiphop beats of all time (50)
    "37i9dQZF1DWXRqgorJj26U", #rock classics (200)
    "7AsZxdanrHJFNGp3q7tp1f", #best of pop 1990 2023 (1200)

    "37i9dQZF1DWTJ7xPn4vNaz", #all out 70s
    "37i9dQZF1DX4UtSsGT1Sbe", #all out 80s
    "37i9dQZF1DXbTxeAdrVG2l", #all out 90s
    "37i9dQZF1DX4o1oenSJRJd", #all out 2000s
    "37i9dQZF1DX5Ejj0EkURtP"  #all out 2010s
]

df_picks_songs = get_df_from_multi_playlists(picks_playlists)
len(df_picks_songs)

3603

In [94]:
df_top_songs.to_csv('top_songs.csv')
df_picks_songs.to_csv('picks_songs.csv')

In [None]:
#df_top_songs = pd.read_csv('top_songs.csv')
#df_picks_songs = pd.read_csv('picks_songs.csv')

In [26]:
df_all = pd.concat([df_picks_songs, df_top_songs]).drop_duplicates().dropna().reset_index(drop=True)   

## II. Implementing clusters

* Feature selection
* Scaling
* Clustering with k-means

#### Helper function - implements the algorithm and applies it to the inputed song, returns recommendation

In [91]:
def apply_scaler_and_clustering(df, track_id):
    
    cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
    
    scaler = StandardScaler()
    x_prep = scaler.fit_transform(df[cols])

    kmeans = KMeans(n_clusters=3, random_state=42)
    kmeans.fit(x_prep)

    clusters = kmeans.predict(x_prep)

    scaled_df = pd.DataFrame(x_prep, columns=df[cols].columns)
    scaled_df['song_name'] = df['song_name']
    scaled_df['main_artist'] = df['main_artist']
    scaled_df['cluster'] = clusters
    
    #return scaled_df

       
    audio_features = sp.audio_features(track_id)
    df_ = pd.DataFrame(audio_features)
    new_features = df_[cols]
    scaled_x = scaler.transform(new_features)
    prediction = kmeans.predict(scaled_x)
    
    filtered_df = np.array(scaled_df[scaled_df['cluster'] == prediction[0]][x.columns], order="C")
    closest, _ = pairwise_distances_argmin_min(scaled_x, filtered_df)
    
    print('\n[RECOMMENDED SONG]')
    print(' - '.join([scaled_df.loc[closest]['song_name'].values[0], scaled_df.loc[closest]['main_artist'].values[0]]))
    print('\n=======================================')

## III. Input

#### Helper function - searches song by keyword, checks whether it is trending, then calls the recommendation algorithm passing in the song id

In [92]:
def get_song_recommendation(kws):
    
    results = sp.search(q=f'track:{kws}', limit=1)
    
    track_id = results['tracks']['items'][0]['id']
    track_name = results['tracks']['items'][0]['name']
    track_artist = results['tracks']['items'][0]['artists'][0]['name']
    
    print('=======================================\n')
    print('[SEARCHED SONG]')
    print(track_name, '-', track_artist)

    apply_scaler_and_clustering(df_top_songs if track_id in df_top_songs['id'].values else df_all, track_id)

#### test keywords

In [96]:
#keyword format: "song title" OR "song tile" + "artist"
test_kws = [
    'viva la vida',
    'sweet child',
    'fast car',
    'paint the town red',
    'its my life',
    'still dre'
]

for kw in test_kws:
    get_song_recommendation(kw)


[SEARCHED SONG]
Viva La Vida - Coldplay

[RECOMMENDED SONG]
The Flight To Neverland - From "Hook" - John Williams


[SEARCHED SONG]
Sweet Child O' Mine - Guns N' Roses

[RECOMMENDED SONG]
Blind - Reznikov


[SEARCHED SONG]
Fast Car - Luke Combs

[RECOMMENDED SONG]
Paint The Town Red - Doja Cat


[SEARCHED SONG]
Paint The Town Red - Doja Cat

[RECOMMENDED SONG]
Paint The Town Red - Doja Cat


[SEARCHED SONG]
It's My Life - Bon Jovi

[RECOMMENDED SONG]
Little Red Rooster - Willie Dixon


[SEARCHED SONG]
Still D.R.E. - Dr. Dre

[RECOMMENDED SONG]
Loca - Dubdogz



#### Input dialog

In [97]:
kw = input('Choose a song (or optionally a song + artist name): ')
print()
get_song_recommendation(kw)

Choose a song (or optionally a song + artist name): sweet child


[SEARCHED SONG]
Sweet Child O' Mine - Guns N' Roses

[RECOMMENDED SONG]
Blind - Reznikov

