In [None]:
##
##   Packages Used
##

import pandas as pd
import gdown
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as tk
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import numpy as np
from scipy.spatial import distance
from numpy.linalg import LinAlgError
from IPython.display import HTML

##
##   Data Upload
##

file_id = '1JjYmvA8qTPOh_dVAVkvsapP-xtes7F4h'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'spotify_songs.csv'
gdown.download(url, output, quiet=False)

songs = pd.read_csv('spotify_songs.csv')

##
##   Data Preprocessing
##

def standardize_date(dates):
    """
    Standardizes a list of date strings to the format 'YYYY-MM-DD'.

    Parameters:
        dates (iterable): An iterable containing date strings in various formats
                          ('YYYY', 'YYYY-MM', 'YYYY-MM-DD').

    Returns:
        pd.Series: A Pandas Series with dates converted to datetime format, where:
                   - 'YYYY' is converted to 'YYYY-01-01'
                   - 'YYYY-MM' is converted to 'YYYY-MM-01'
                   - 'YYYY-MM-DD' remains unchanged
                   Invalid dates will be set as NaT (Not a Time).
    """
    standardized_dates = []
    for date in dates:
        if pd.isna(date):
            standardized_dates.append(date)
        elif len(date) == 4:
            standardized_dates.append(f"{date}-01-01")
        elif len(date) == 7:
            standardized_dates.append(f"{date}-01")
        else:
            standardized_dates.append(date)

    return pd.to_datetime(standardized_dates, errors='coerce')


def preprocesse_songs(df):
    df.drop(columns=['playlist_name', 'playlist_id'], inplace=True)
    df.drop_duplicates(subset=['track_name','track_artist'], inplace=True)
    df = df[(df.duration_ms > df.duration_ms.quantile(0.01))]
    df.dropna(inplace=True)
    df['track_album_release_date'] = standardize_date(df['track_album_release_date'])
    df['release_year']  = df['track_album_release_date'].dt.year
    df = df.drop(columns=['track_album_release_date'])
    encoder = LabelEncoder()
    df['track_artist_label'] = encoder.fit_transform(df['track_artist'])
    df['track_album_id_label'] = encoder.fit_transform(df['track_album_id'])
    df['artist_track'] = df.apply(lambda x: f"{x['track_artist']} - {x['track_name']}", axis=1)
 
    return df

songs = preprocesse_songs(songs)


##
##   Clustering Process
##

clustering_data =  songs[['danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo','track_artist_label','release_year']]



kmeans = KMeans(n_clusters=8)
songs.loc[:, 'kmeans_labels'] = kmeans.fit_predict(clustering_data)
clustering_data.loc[:, 'kmeans_labels'] = kmeans.fit_predict(clustering_data)

##
##   Prediction Process
##

## User Input - These should pprobably be drop down menus using the artist_track column as options and a few options of how many songs
## the user would like to be recommended as the output (e.g.: 10, 20, or 30)
song_name = input('Input the songs artist and song name as "Artist - Track":\n')
# Should we have a max here? 100 maybe?
top_n = int(input('How many songs would you like to be recommended?\n'))


user_input = songs[(songs.artist_track==song_name)]

num_user_input = clustering_data.loc[user_input.index]


like_songs = clustering_data[(clustering_data.kmeans_labels.values==num_user_input.kmeans_labels.values)]
like_songs = like_songs.drop(index=user_input.index)


# Calculate the covariance matrix
cov_matrix = np.cov(like_songs, rowvar=False)

try:
    # Inverse of the covariance matrix
    inv_cov_matrix = np.linalg.inv(cov_matrix)
except LinAlgError:
    # If the covariance matrix is singular (which means it's not inversible), compute the pseudoinverse
    inv_cov_matrix = np.linalg.pinv(cov_matrix)

# Function to find the "top_n" most similar songs using Mahalanobis distance
def find_top_similar_songs(songs_df, user_song, inv_cov_matrix, top_n=top_n):
    user_song = np.array(user_song.values.flatten())
    

    distances = {}
    for idx, song_features in songs_df.iterrows():
        song_features = np.array(song_features.values.flatten())
        # Calculate Mahalanobis distance between user song and current song
        distances[idx] = distance.mahalanobis(user_song, song_features, inv_cov_matrix)
    
    # Top N most similar songs by distance
    sorted_distances = sorted(distances.items(), key=lambda x: x[1])
    top_similar_indices = [idx for idx, _ in sorted_distances[:top_n]]
    
    top_songs = songs_df.loc[top_similar_indices]
    top_distances = [distances[idx] for idx in top_similar_indices]
    
    return top_songs, top_distances




top_songs, top_distances = find_top_similar_songs(like_songs, num_user_input, inv_cov_matrix, top_n=top_n)
recommended_tracks = songs[(songs.index.isin(top_songs.index))][['track_name','track_artist','track_album_name']]

print(recommended_tracks)

recommended_tracks = HTML(recommended_tracks.to_html(index=False))

In [None]:
clustering_data =  songs[['danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo','track_artist_label','release_year']]

mood_musics = mood_musics[['track_id', 'track_name', 'track_artist', 'track_popularity', 
                                           'playlist_genre', 'playlist_subgenre', 'year', 'mood']]
    

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

class MusicRecommender:
    def __init__(self, songs_df):
        self.songs = self.preprocess_songs(songs_df)
        self.kmeans = None
        self.inv_cov_matrix = None
        self.clustering_data = self.prepare_clustering_data()
        self.train_kmeans()

    def preprocess_songs(self, df):
        
        df.drop(columns=['playlist_name', 'playlist_id'], inplace=True)
        df.drop_duplicates(subset=['track_name', 'track_artist'], inplace=True)
        df = df[df['duration_ms'] > df['duration_ms'].quantile(0.01)]
        df.dropna(inplace=True)
        df['release_year'] = pd.to_datetime(df['track_album_release_date'], errors='coerce').dt.year
        encoder = LabelEncoder()
        df['track_artist_label'] = encoder.fit_transform(df['track_artist'])
        df['track_album_id_label'] = encoder.fit_transform(df['track_album_id'])
        df['artist_track'] = df.apply(lambda x: f"{x['track_artist']} - {x['track_name']}", axis=1)
        return df

    def prepare_clustering_data(self):
        features = [
            'danceability', 'energy', 'key', 'loudness', 'mode',
            'speechiness', 'acousticness', 'instrumentalness',
            'liveness', 'valence', 'tempo', 'track_artist_label', 'release_year'
        ]
        return self.songs[features]

    def train_kmeans(self, n_clusters=8):
        self.kmeans = KMeans(n_clusters=n_clusters)
        self.songs['kmeans_labels'] = self.kmeans.fit_predict(self.clustering_data)
        self.clustering_data['kmeans_labels'] = self.kmeans.labels_
        self.calculate_cov_matrix()

    def calculate_cov_matrix(self):
        like_songs = self.clustering_data[self.clustering_data['kmeans_labels'] == self.clustering_data['kmeans_labels'][0]]
        cov_matrix = np.cov(like_songs, rowvar=False)
        try:
            self.inv_cov_matrix = np.linalg.inv(cov_matrix)
        except np.linalg.LinAlgError:
            self.inv_cov_matrix = np.linalg.pinv(cov_matrix)

    def recommend_by_song_name(self, song_name, top_n=10):
        user_input = self.songs[self.songs['artist_track'] == song_name]
        num_user_input = self.clustering_data.loc[user_input.index]
        like_songs = self.clustering_data[self.clustering_data['kmeans_labels'] == num_user_input['kmeans_labels'].values[0]]
        like_songs = like_songs.drop(index=user_input.index)
        return self.find_top_similar_songs(like_songs, num_user_input, top_n)

    def recommend_by_mood(self, mood, n=20):
        mood_musics = self.songs[self.songs['mood'] == mood].sort_values(by='track_popularity', ascending=False).head(300)
        mood_musics = mood_musics[['track_id', 'track_name', 'track_artist', 'track_popularity', 'playlist_genre', 'playlist_subgenre', 'year', 'mood']]
        unique_years = mood_musics['year'].nunique()
        songs_per_year = max(1, n // unique_years)
        sampled_musics = mood_musics.groupby('year').apply(lambda x: x.sample(min(len(x), songs_per_year))).reset_index(drop=True)
        if len(sampled_musics) < n:
            additional_songs = mood_musics.drop(sampled_musics.index).sample(n - len(sampled_musics))
            sampled_musics = pd.concat([sampled_musics, additional_songs]).reset_index(drop=True)
        return sampled_musics.sample(n).reset_index(drop=True)

    def recommend_by_characteristics(self, energy, danceability, speechiness, acousticness, top_n=10):
        inference_df = pd.DataFrame({
            'energy': [float(energy)], 'danceability': [float(danceability)],
            'speechiness': [float(speechiness)], 'acousticness': [float(acousticness)]
        })
        # You would call a clustering model here and make label predictions on inference_df
        # For now, this is a placeholder
        return "Step 1: Call Cluster model and make label predictions on inference_df"

    def find_top_similar_songs(self, songs_df, user_song, top_n=10):
        user_song = np.array(user_song.values.flatten())
        distances = {
            idx: distance.mahalanobis(user_song, np.array(song_features.values.flatten()), self.inv_cov_matrix)
            for idx, song_features in songs_df.iterrows()
        }
        sorted_distances = sorted(distances.items(), key=lambda x: x[1])
        top_similar_indices = [idx for idx, _ in sorted_distances[:top_n]]
        return self.songs.loc[top_similar_indices, ['track_name', 'track_artist', 'track_album_name']]


In [5]:
import gdown
import pandas as pd

In [6]:
# Load and preprocess the data
file_id = '1JjYmvA8qTPOh_dVAVkvsapP-xtes7F4h'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'spotify_songs.csv'
gdown.download(url, output, quiet=False)

songs = pd.read_csv('spotify_songs.csv')
songs

Downloading...
From: https://drive.google.com/uc?id=1JjYmvA8qTPOh_dVAVkvsapP-xtes7F4h
To: /Users/gabrielvictorgomesferreira/artificial_intelligence/isu_classes/projects/Spotify-Data-Project/code/final/spotify_songs.csv
100%|██████████| 7.97M/7.97M [00:00<00:00, 8.46MB/s]


Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102000,0.000000,0.0653,0.5180,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.072400,0.004210,0.3570,0.6930,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.079400,0.000023,0.1100,0.6130,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.1020,0.028700,0.000009,0.2040,0.2770,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.080300,0.000000,0.0833,0.7250,123.976,189052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32828,7bxnKAamR3snQ1VGLuVfC1,City Of Lights - Official Radio Edit,Lush & Simon,42,2azRoBBWEEEYhqV6sb7JrT,City Of Lights (Vocal Mix),2014-04-28,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,...,2,-1.814,1,0.0936,0.076600,0.000000,0.0668,0.2100,128.170,204375
32829,5Aevni09Em4575077nkWHz,Closer - Sultan & Ned Shepard Remix,Tegan and Sara,20,6kD6KLxj7s8eCE3ABvAyf5,Closer Remixed,2013-03-08,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,...,0,-4.462,1,0.0420,0.001710,0.004270,0.3750,0.4000,128.041,353120
32830,7ImMqPP3Q1yfUHvsdn7wEo,Sweet Surrender - Radio Edit,Starkillers,14,0ltWNSY9JgxoIZO4VzuCa6,Sweet Surrender (Radio Edit),2014-04-21,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,...,6,-4.899,0,0.0481,0.108000,0.000001,0.1500,0.4360,127.989,210112
32831,2m69mhnfQ1Oq6lGtXuYhgX,Only For You - Maor Levi Remix,Mat Zo,15,1fGrOkHnHJcStl14zNx8Jy,Only For You (Remixes),2014-01-01,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,...,2,-3.361,1,0.1090,0.007920,0.127000,0.3430,0.3080,128.008,367432


In [7]:
recommender = MusicRecommender(songs)

# Example usage

print(recommender.recommend_by_mood("Happy", n=20))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['release_year'] = pd.to_datetime(df['track_album_release_date'], errors='coerce').dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['track_artist_label'] = encoder.fit_transform(df['track_artist'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
print(recommender.recommend_by_song_name("Artist - Track", top_n=10))