In [None]:
!pip install scikit-learn scipy matplotlib

Spotify Recommendations
Starting Off
A program that takes the available songs in Spotify's Database (Generated from the Kaggle dataset and Spotipy) and tries to recommend you songs based on an input of the title of the song and the year.

Abstract
Our goal was to create something that can actually recommend songs based on similarities. We decided to do this because we both like music and we wanted to try something completely out of our league. We also wanted to try connecting it to Spotify so that any song suggestions could be played automatically.

Python/Jupyter
Working with large data
Machine Learning
[inspiration](https://towardsdatascience.com/how-to-build-an-amazing-music-recommendation-system-4cce2719a572)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
import os
%matplotlib inline

# Read the three CSV files (songs, genres, years) from the working directory.
spotify_data, genre_data, data_by_year = (
    pd.read_csv(csv_name)
    for csv_name in ('data.csv', 'data_by_genres.csv', 'data_by_year.csv')
)

Sorting
We sorted our (Kaggle) data into the specific features listed below. Plotly then generates a graph comparing the values of these features for songs organized by year, to gauge the change in music over time.

In [None]:
import plotly.express as px 

# Columns of data_by_year to draw; each one becomes its own line on the chart.
sound_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence']
fig = px.line(
    data_by_year,
    x='year',
    y=sound_features,
    title='Audio features by year',  # a title lets the figure stand on its own
)

fig.show()

In [None]:
import plotly.express as px

# The ten rows of genre_data with the highest 'popularity' value.
top10_genres = genre_data.nlargest(10, 'popularity')
fig = px.bar(
    top10_genres,
    x='genres',
    y=['valence', 'energy', 'danceability', 'acousticness'],
    barmode='group',
    title='Audio features of the 10 most popular genres',  # a title lets the figure stand on its own
)

fig.show()

Clustering
What is Clustering?
Clustering is partitioning the data into groups by assigning each data point to the group with the nearest mean (or other measure). Essentially, it's a way to understand data.

The cell below splits the genre data into ten major clusters.

In [None]:

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# KMeans no longer accepts `n_jobs` (removed in scikit-learn 1.0), so it is not passed.
# A fixed random_state makes the cluster ids repeatable from run to run.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=42)),
])

# Numeric genre features only. Dropping a 'cluster' column left behind by an earlier
# run keeps this cell safe to re-execute without feeding old labels back in.
X = genre_data.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

Visualizing Clusters
Using scikit-learn's t-SNE, we can plot the clusters produced by the KMeans algorithm in two dimensions. The algorithm itself groups similar points of data together while keeping the generated clusters as far away from each other as possible.

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic; a fixed random_state keeps the 2-D layout the same between runs.
tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=2, random_state=42)),
])
genre_embedding = tsne_pipeline.fit_transform(X)

# 2-D coordinates per row, joined with the genre name and its K-Means cluster id.
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

In [None]:
import plotly.express as px 

# Scatter of the 2-D projection, coloured by cluster id; hovering shows the genre name.
fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'genres'],
    title='Genre clusters (2-D projection)',  # a title lets the figure stand on its own
)
fig.show()

Organizing by Song
Essentially the same thing, but this time we're doing it by song features that were established above. So then we are left with a cluster of song titles.

In [None]:
# KMeans no longer accepts `n_jobs` (removed in scikit-learn 1.0), so it is not passed.
# A fixed random_state makes the cluster labels repeatable from run to run.
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, verbose=2, random_state=42)),
    ],
    verbose=True,
)

# Numeric song features only. Dropping a 'cluster_label' column left behind by an
# earlier run keeps the fitted scaler's feature set stable when this cell is re-executed.
X = spotify_data.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')

number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
spotify_data['cluster_label'] = song_cluster_labels

In [None]:
from sklearn.decomposition import PCA

# Scale the song features, then reduce them to two principal components for plotting.
pca_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# Attach each song's title and cluster label to its 2-D coordinates.
projection = pd.DataFrame(song_embedding, columns=['x', 'y']).assign(
    title=spotify_data['name'],
    cluster=spotify_data['cluster_label'],
)

In [None]:
import plotly.express as px

# Scatter of the 2-D song projection, coloured by cluster label; hovering shows the title.
fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
    title='Song clusters (2-D projection)',  # a title lets the figure stand on its own
)

fig.show()

Making The Recommendation
This is where, given an input song, the program first checks whether it exists in the Kaggle dataset before falling back to Spotipy. It then takes the features of the input songs, scales them with the scaler fitted for the KMeans pipeline, and picks the songs in the dataset whose features are closest to those of the songs you entered.

In [None]:
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict


# Credentials come from the environment so that no secret is stored in the notebook.
# NOTE(review): the id/secret pair that used to be hard-coded here has been exposed
# and should be rotated in the Spotify developer dashboard.
SPOTIFY_CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
SPOTIFY_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")
if not SPOTIFY_CLIENT_ID or not SPOTIFY_SECRET:
    raise RuntimeError(
        "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables "
        "before running this cell."
    )

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID,
                                                           client_secret=SPOTIFY_SECRET))

def find_song(name, year):
    """
    Look up a song on Spotify and return its metadata and audio features.

    Runs a one-result search through the module-level Spotipy client ``sp`` and,
    for the top hit, fetches that track's audio features.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year used in the search query.

    Returns
    -------
    pandas.DataFrame or None
        A frame holding ``name``, ``year``, ``explicit``, ``duration_ms``,
        ``popularity`` and every key of the audio-features payload.
        ``None`` when the search has no hits or no audio features are
        available for the track.
    """
    results = sp.search(q= 'track: {} year: {}'.format(name,
                                                       year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]

    # The audio-features answer may hold None for a track Spotify cannot describe;
    # treat that like "not found" instead of failing on None.items().
    features = sp.audio_features(track['id'])
    if not features or features[0] is None:
        return None

    # The list-valued entries make the frame one row long; scalar values merged in
    # below are broadcast onto that row by the DataFrame constructor.
    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }

    # Add every audio feature returned by Spotify (acousticness, energy, etc.).
    # A key present in both places takes the audio-features value.
    song_data.update(features[0])

    return pd.DataFrame(song_data)

In [None]:
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

# Numeric feature columns, in the order used to build each song's feature vector.
number_cols = [
    'valence', 'year', 'acousticness',
    'danceability', 'duration_ms', 'energy',
    'explicit', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode',
    'popularity', 'speechiness', 'tempo',
]

def get_song_data(song, spotify_data):
    """
    Fetch the data for one song, preferring the local dataset over Spotify.

    ``song`` is a mapping with 'name' and 'year' entries. The first row of
    ``spotify_data`` that matches both is returned; when no row matches, the
    lookup is delegated to ``find_song`` and its result is returned as-is.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    # No local match: fall back to the Spotify lookup.
    if len(matches) == 0:
        return find_song(song['name'], song['year'])

    return matches.iloc[0]
        

def get_mean_vector(song_list, spotify_data):
    """
    Average the numeric feature vectors of the given songs.

    Parameters
    ----------
    song_list : list of dict
        Songs to average; each entry is passed to ``get_song_data`` and its
        'name' entry is used in the warning message.
    spotify_data : pandas.DataFrame
        Dataset handed to ``get_song_data`` for the lookup.

    Returns
    -------
    numpy.ndarray
        1-D array with one mean value per entry of ``number_cols``.

    Raises
    ------
    ValueError
        If none of the songs could be resolved, since a mean of nothing
        cannot be used for a recommendation.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data yields a Series for dataset hits but a DataFrame for Spotify
        # lookups; ravel() brings both to the same 1-D shape so they can be stacked.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the requested songs exist in Spotify or in database')

    return np.mean(np.vstack(song_vectors), axis=0)

def flatten_dict_list(dict_list):
    """
    Turn a list of dicts into a dict of lists.

    Every key seen in any of the dicts maps to the list of values stored under
    it, in input order. For example
    ``[{'name': 'A', 'year': 1}, {'name': 'B', 'year': 2}]`` becomes
    ``{'name': ['A', 'B'], 'year': [1, 2]}``.

    Parameters
    ----------
    dict_list : list of dict
        Dicts to merge; may be empty and need not share the same keys.

    Returns
    -------
    collections.defaultdict
        A ``defaultdict(list)``; looking up a key that never occurred yields
        an empty list.
    """
    # defaultdict(list) creates each key's list on first use, so an empty input
    # and keys missing from the first dict are both handled.
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict
        

def recommend_songs(song_list, spotify_data, n_songs=10):
    """
    Recommend the dataset songs closest to the average of the given songs.

    The input songs are averaged into one feature vector, that vector and the
    dataset are scaled with the scaler fitted inside ``song_cluster_pipeline``,
    and dataset songs are ranked by cosine distance to the averaged vector.

    Parameters
    ----------
    song_list : list of dict
        Input songs, each with 'name' and 'year' entries.
    spotify_data : pandas.DataFrame
        Dataset to recommend from; must contain the ``number_cols`` columns
        plus 'name', 'year' and 'artists'.
    n_songs : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        Up to ``n_songs`` records with 'name', 'year' and 'artists', nearest
        first, excluding songs whose name matches one of the inputs.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    # Reuse the scaler fitted as the first step of the clustering pipeline.
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')

    # Rank every song first and drop the inputs before cutting to n_songs;
    # cutting first would return fewer songs whenever an input ranks near the top.
    ranked = spotify_data.iloc[np.argsort(distances[0])]
    rec_songs = ranked[~ranked['name'].isin(song_dict['name'])].head(n_songs)
    return rec_songs[metadata_cols].to_dict(orient='records')

In [None]:
# Ask for recommendations based on a five-song seed list.
recommendations = recommend_songs(
    song_list=[
        {'name': "Driver's license", 'year': 2021},
        {'name': 'Champagne Problems', 'year': 2020},
        {'name': 'August', 'year': 2020},
        {'name': 'Marjorie', 'year': 2020},
        {'name': 'The 1', 'year': 2020},
    ],
    spotify_data=spotify_data,
)

print(recommendations)

In [None]:
# Ask for recommendations based on a single seed song.
recommendations = recommend_songs(
    song_list=[{'name': 'Crush', 'year': 2018}],
    spotify_data=spotify_data,
)

print(recommendations)

Conclusion
It's not the greatest, learning became more important than doing

Ended up being more of a tutorial/guide walkthrough
Not a large amount of data to work with; songs are limited to 100k
Couldn't quite figure out how to make a new app
Python...
VERSIONS ARE IMPORTANT