In [1]:
import pandas as pd
import numpy as np
import gdown
import json
import sys
import os
import pickle
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 20)
from sklearn.preprocessing import LabelEncoder
from scipy.spatial import distance
from numpy.linalg import LinAlgError
from helper import *
import streamlit as st



class MusicRecommender:
    """Recommend songs by predicted mood or by similarity to a given track.

    On construction the recommender loads its JSON config, reads the song
    table (downloading it first when no local copy can be read), unpickles
    three pre-trained models and builds ``self.preprocessed_songs``, the table
    every recommendation method works on.

    ``standardize_date`` and ``output_format`` are not defined in this file;
    they are presumably supplied by ``from helper import *`` (TODO confirm).
    """

    # How many of the most popular songs of a mood form the sampling pool.
    MOOD_POOL_SIZE = 300
    # Minimum number of same-cluster candidates needed to rank by similarity.
    MIN_CLUSTER_SIZE = 20

    def __init__(self):
        self.config = self.load_config()
        self.songs = self.read_data()

        # Importing pre-trained models
        self.mood_gb_model = self._load_pickle('../models/mood_gb_model.pkl')
        self.mood_encoder_model = self._load_pickle('../models/mood_encoder_model.pkl')
        self.kmeans_model = self._load_pickle('../models/kmeans_model.pkl')

        # Preprocess data
        self.preprocessed_songs = self.preprocess_songs()

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``."""
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files that come from a trusted source.
        with open(path, 'rb') as file:
            return pickle.load(file)

    def load_config(self):
        """Return the parsed contents of ``../config/config.json``."""
        with open('../config/config.json', 'r') as f:
            config = json.load(f)
        return config

    def read_data(self):
        """Load the songs CSV, downloading it first if it cannot be read locally.

        Uses the config keys ``file_id``, ``url``, ``output_path`` and
        ``output_file``.

        Returns:
            DataFrame: the raw song table as read by ``pd.read_csv``.
        """
        # Access the values from the loaded JSON
        file_id = self.config['file_id']
        url = self.config['url'].replace("file_id", file_id)  # Replace "file_id" in the URL
        output_path = self.config['output_path']
        output_file = self.config['output_file']
        data_loc = os.path.join(output_path, output_file)

        # Try loading the data, otherwise download and save
        try:
            songs = pd.read_csv(data_loc)
            print("Loading data from local repository...")
            print("Data loaded! \n")
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: pandas' empty-data,
            # parser and decoding errors. Unlike a bare ``except`` this lets
            # KeyboardInterrupt and programming errors propagate.
            print("Downloading data from cloud...")
            if output_path:
                os.makedirs(output_path, exist_ok=True)
            gdown.download(url, data_loc, quiet=True)
            print("Reading data...")
            songs = pd.read_csv(data_loc)
            print("Data loaded! \n")

        return songs

    def mood_prediction(self, df):
        """Predict a mood for every row of ``df``.

        Adds the columns ``mood_numeric`` (raw model output) and ``mood``
        (decoded through the mood encoder) to ``df`` in place.

        Returns:
            Series: the new ``mood`` column.
        """
        # Load input features
        model_input_features = self.config['gb_input_features']

        # Predict mood
        df['mood_numeric'] = self.mood_gb_model.predict(df[model_input_features])
        df['mood'] = self.mood_encoder_model.inverse_transform(df['mood_numeric'])

        return df['mood']

    def kmeans_prediction(self, df):
        """Assign a k-means cluster label to every row of ``df``.

        Adds the column ``kmeans_labels`` to ``df`` in place.

        Returns:
            Series: the new ``kmeans_labels`` column.
        """
        # Load input features
        kmeans_input_features = self.config['kmeans_input_features']

        # Predict cluster
        df['kmeans_labels'] = self.kmeans_model.predict(df[kmeans_input_features])

        return df['kmeans_labels']

    def preprocess_songs(self):
        """Clean ``self.songs`` and enrich it with derived and predicted columns.

        Steps: drop the playlist columns, de-duplicate on (track name, artist),
        remove tracks at or below the 1st percentile of duration, drop rows
        with missing values, derive ``release_year``, label-encode artist and
        album id, build ``artist_track`` and add the mood / cluster predictions.

        ``self.songs`` is left untouched (the cleaning runs on derived frames),
        so the method can be called again without failing on already-dropped
        columns.

        Returns:
            DataFrame: the preprocessed song table.
        """
        print("Preprocessing the data..")
        df = (
            self.songs
            .drop(columns=['playlist_name', 'playlist_id'])
            .drop_duplicates(subset=['track_name', 'track_artist'])
        )
        # The quantile is taken on the de-duplicated table, before dropna.
        df = df[df['duration_ms'] > df['duration_ms'].quantile(0.01)]
        # Explicit copy so the column assignments below never write to a view.
        df = df.dropna().copy()

        df['track_album_release_date'] = standardize_date(df['track_album_release_date'])
        df['release_year'] = df['track_album_release_date'].dt.year
        df = df.drop(columns=['track_album_release_date'])

        # One encoder per column: each fit is independent of the other.
        df['track_artist_label'] = LabelEncoder().fit_transform(df['track_artist'])
        df['track_album_id_label'] = LabelEncoder().fit_transform(df['track_album_id'])
        # Vectorised equivalent of formatting "<artist> - <track>" row by row.
        df['artist_track'] = df['track_artist'].astype(str) + " - " + df['track_name'].astype(str)

        # Predictions
        df['mood'] = self.mood_prediction(df)
        df['kmeans_labels'] = self.kmeans_prediction(df)
        print("Preprocessing completed! \n")
        return df

    def recommend_by_mood(self, mood, top_n=10):
        """Recommend up to ``top_n`` popular songs of the given mood.

        The pool is the ``MOOD_POOL_SIZE`` most popular songs with that mood.
        Songs are sampled per release year so the result spans several years,
        then topped up at random from the rest of the pool if the per-year
        sample is smaller than ``top_n``.

        Parameters:
            mood (str): Mood label to match against the ``mood`` column.
            top_n (int | str): Number of songs wanted; coerced with ``int``.

        Returns:
            The value of ``output_format(sampled_songs, top_n)``, or ``None``
            when no song has the requested mood.
        """
        top_n = int(top_n)
        songs = self.preprocessed_songs
        mood_musics = (
            songs[songs['mood'] == mood]
            .sort_values(by=['track_popularity'], ascending=False)
            .head(self.MOOD_POOL_SIZE)
        )
        if mood_musics.empty:
            print("No songs found for the given mood.")
            return None

        n_years = mood_musics['release_year'].nunique()
        per_year = max(1, top_n // max(1, n_years))

        # Sample group by group and keep the original index labels: the top-up
        # below relies on them to exclude songs that were already picked.
        year_samples = [
            group.sample(min(len(group), per_year))
            for _, group in mood_musics.groupby('release_year')
        ]
        sampled_musics = pd.concat(year_samples) if year_samples else mood_musics.iloc[:0]

        shortfall = top_n - len(sampled_musics)
        if shortfall > 0:
            remaining = mood_musics.drop(index=sampled_musics.index)
            if not remaining.empty:
                # Never ask for more rows than are left in the pool.
                additional_songs = remaining.sample(min(shortfall, len(remaining)))
                sampled_musics = pd.concat([sampled_musics, additional_songs])

        # The per-year sample is ordered by year; shuffle so that any
        # truncation to top_n downstream does not always favour the oldest
        # releases (output_format's truncation behaviour is assumed -- TODO confirm).
        sampled_musics = sampled_musics.sample(frac=1).reset_index(drop=True)

        # Format output
        recommended_tracks = output_format(sampled_musics, top_n)

        return recommended_tracks

    def recommend_similar_songs(self, song_name, top_n=10, artist_name=None):
        """Recommend the ``top_n`` songs closest to ``song_name`` within its cluster.

        Candidates are the songs sharing the query song's k-means label;
        they are ranked by Mahalanobis distance on the k-means input features,
        using the covariance of the candidates themselves.

        Parameters:
            song_name (str): Exact ``track_name`` of the query song.
            top_n (int | str): Number of songs wanted; coerced with ``int``.
            artist_name (str | None): Optional exact ``track_artist`` used to
                disambiguate songs sharing a title. When omitted, the first
                song with a matching title is used.

        Returns:
            The value of ``output_format(rows, top_n)`` where ``rows`` are
            ordered from most to least similar, or ``None`` when the song is
            unknown, its cluster has fewer than ``MIN_CLUSTER_SIZE`` other
            songs, or nothing is selected.
        """
        top_n = int(top_n)
        kmeans_input_features = self.config['kmeans_input_features']
        songs = self.preprocessed_songs

        # User input and feature extraction
        matches = songs['track_name'] == song_name
        if artist_name is not None:
            matches = matches & (songs['track_artist'] == artist_name)
        user_input = songs[matches]
        if user_input.empty:
            print("Song not found in the dataset.")
            return None

        seed = user_input.iloc[0]
        user_song = seed[kmeans_input_features].to_numpy(dtype=float)

        # Songs with the same kmeans label, without the matched query rows
        like_songs = songs.loc[
            songs['kmeans_labels'] == seed['kmeans_labels'], kmeans_input_features
        ]
        like_songs = like_songs.drop(index=user_input.index, errors='ignore')

        # Ensure there are enough songs for the analysis
        if len(like_songs) < self.MIN_CLUSTER_SIZE:
            print("Not enough similar songs found to make a recommendation.")
            return None

        # Covariance of the candidates; atleast_2d keeps a single-feature
        # configuration (0-d covariance) invertible.
        candidates = like_songs.to_numpy(dtype=float)
        cov_matrix = np.atleast_2d(np.cov(candidates, rowvar=False))
        try:
            inv_cov_matrix = np.linalg.inv(cov_matrix)
        except np.linalg.LinAlgError:
            inv_cov_matrix = np.linalg.pinv(cov_matrix)

        # Mahalanobis distance of every candidate at once:
        # sqrt((x - u) @ VI @ (x - u)) per row. Tiny negative values caused by
        # round-off are clipped so they cannot turn into NaN.
        deltas = candidates - user_song
        squared = np.einsum('ij,jk,ik->i', deltas, inv_cov_matrix, deltas)
        distances = np.sqrt(np.clip(squared, 0.0, None))
        nearest = np.argsort(distances, kind='stable')[:top_n]
        if nearest.size == 0:
            print("No similar songs found.")
            return None

        # Select by label in distance order so the closest song comes first.
        recommended_tracks = songs.loc[like_songs.index[nearest]]

        # Format output
        recommended_tracks = output_format(recommended_tracks, top_n)
        return recommended_tracks

In [2]:
# Build the recommender: loads the config, the song data and the pickled models,
# then preprocesses the songs (see MusicRecommender.__init__).
recommender = MusicRecommender()

Loading data from local repository...
Data loaded! 

Preprocessing the data..
Preprocessing completed! 



In [3]:
def search_songs(songs_df, partial_name, max_results=10, regex=False):
    """
    Searches for songs with names that partially match the given input.

    The match is case-insensitive. By default the input is treated as literal
    text, so characters such as "(", "?" or "." in a title are matched as-is
    instead of being interpreted as a regular expression.

    Parameters:
        songs_df (DataFrame): The DataFrame containing song information.
        partial_name (str): The partial name of the song to search for.
        max_results (int): The maximum number of results to return.
        regex (bool): Set to True to interpret partial_name as a regular expression.

    Returns:
        DataFrame: A DataFrame with matching song names and artists.
    """
    # na=False: rows with a missing track name never match.
    name_matches = songs_df['track_name'].str.contains(
        partial_name, case=False, na=False, regex=regex
    )
    matching_songs = songs_df.loc[name_matches, ['track_name', 'track_artist']]
    return matching_songs.drop_duplicates().head(max_results)


In [4]:
def search_artist(songs_df, partial_artist, max_results=10, regex=False):
    """
    Searches for artists with names that partially match the given input.

    The match is case-insensitive. By default the input is treated as literal
    text, so names containing characters such as "$", "+" or "." are matched
    as-is instead of being interpreted as a regular expression.

    Parameters:
        songs_df (DataFrame): The DataFrame containing song information.
        partial_artist (str): The partial name of the artist to search for.
        max_results (int): The maximum number of results to return.
        regex (bool): Set to True to interpret partial_artist as a regular expression.

    Returns:
        DataFrame: A DataFrame with matching artist names.
    """
    # na=False: rows with a missing artist never match.
    artist_matches = songs_df['track_artist'].str.contains(
        partial_artist, case=False, na=False, regex=regex
    )
    matching_artists = songs_df.loc[artist_matches, ['track_artist']]
    return matching_artists.drop_duplicates().head(max_results)

In [5]:
# Take the preprocessed song table built by the recommender and preview its first row.
data = recommender.preprocessed_songs
data.head(1)

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,playlist_genre,playlist_subgenre,danceability,energy,...,valence,tempo,duration_ms,release_year,track_artist_label,track_album_id_label,artist_track,mood_numeric,mood,kmeans_labels
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,pop,dance pop,0.748,0.916,...,0.518,122.036,194754,2019,2759,7688,Ed Sheeran - I Don't Care (with Justin Bieber)...,0,Energetic,3


In [6]:
# All rows whose title is exactly "Someone Like You" (the title is not unique).
data.loc[data['track_name'].eq("Someone Like You")]

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,playlist_genre,playlist_subgenre,danceability,energy,...,valence,tempo,duration_ms,release_year,track_artist_label,track_album_id_label,artist_track,mood_numeric,mood,kmeans_labels
1216,6QPKYGnAW9QozVz2dSWqRg,Someone Like You,Adele,79,7n3QJc7TBOxXtlYh4Ssll8,21,pop,dance pop,0.554,0.321,...,0.288,135.047,285240,2011,235,20538,Adele - Someone Like You,4,Relaxed,6
28185,2Xn9RATH8iAAbfipxhMMFK,Someone Like You,Dj Electrocimatic,17,28FIA1bhvxZxUAMfXuD5cn,When the Bats Go Through the Night,edm,electro house,0.732,0.915,...,0.761,135.067,152889,2019,2544,5818,Dj Electrocimatic - Someone Like You,1,Happy,1


In [7]:
# Positional lookup (iloc, not the index label): position 22638 is the row that
# carries index label 28185 in the previous output (same track_id).
data.iloc[22638]

track_id                              2Xn9RATH8iAAbfipxhMMFK
track_name                                  Someone Like You
track_artist                               Dj Electrocimatic
track_popularity                                          17
track_album_id                        28FIA1bhvxZxUAMfXuD5cn
track_album_name          When the Bats Go Through the Night
playlist_genre                                           edm
playlist_subgenre                              electro house
danceability                                           0.732
energy                                                 0.915
key                                                        1
loudness                                              -7.372
mode                                                       1
speechiness                                           0.0504
acousticness                                         0.00742
instrumentalness                                       0.831
liveness                

In [8]:
# Case-insensitive partial match on the artist name; duplicate artists are collapsed.
search_artist(data, "Adele")

Unnamed: 0,track_artist
1216,Adele


In [9]:
# Titles of every track by Adele, renumbered from 0 for display.
data.loc[data['track_artist'] == "Adele", "track_name"].reset_index(drop=True)

0                     Someone Like You
1                 Set Fire to the Rain
2     Send My Love (To Your New Lover)
3                  Rolling in the Deep
4                                Hello
5                        Right As Rain
6                                Tired
7               Melt My Heart to Stone
8                          He Won't Go
9                    Chasing Pavements
10                      Turning Tables
11                 Rolling In The Deep
Name: track_name, dtype: object

In [10]:
# Case-insensitive partial match on the track name; returns at most 10 (name, artist) pairs.
search_songs(data, "Hello")

Unnamed: 0,track_name,track_artist
1827,Had Me @ Hello,Olivia Holt
3781,Hello,Adele
3796,Hello,Martin Solveig
5503,Hello Hello,Fickle Friends
6281,Hello Hello,Church & AP
7456,Hello To Oblivion,Formula
8836,Hello,Eminem
9208,Hello - Feat. Dr. Dre And MC Ren,Ice Cube
9650,Hello,Ice Cube
10708,Hello Cotto,Duki


In [11]:
# Store the recommendations under their own name: rebinding `data` here would
# replace the song table that the earlier cells query, breaking them on re-run.
similar_songs = recommender.recommend_similar_songs("Someone Like You", 3)
similar_songs

Unnamed: 0,Song ID,Song Name,Artist Name,Album Name
1,3cHkjS45Xiduc6DcXtXmAo,Running Thru 3AM,Aeris Roves,Moon By Island Gardens
2,09YXhYMYIHZBHVKGEQrSQF,Return To Love,Andrea Bocelli,Return To Love
3,5WEAZtjEufELtaxqYcqCJg,Years,Astrid S,Down Low


In [12]:
# Store the recommendations under their own name instead of rebinding `data`,
# which the earlier cells use as the song table.
# top_n is passed as a string on purpose here: the method coerces it with int().
happy_songs = recommender.recommend_by_mood("Happy", "1")
happy_songs

Unnamed: 0,Song ID,Song Name,Artist Name,Album Name
1,66TRwr5uJwPt15mfFkzhbi,Crank That (Soulja Boy),Soulja Boy,souljaboytellem.com


In [13]:
import scipy

In [14]:
# Record the pandas version this notebook was run with.
print(pd.__version__)

2.2.3


In [32]:
import os

# Credentials must be provided through the environment (shell export, untracked
# .env file, secrets manager) -- never pasted into the notebook, not even in a comment.
# NOTE: an access token and user id were previously committed in this cell;
# treat them as compromised and revoke/rotate them.

# Now, check the value
spotify_token = os.getenv("SPOTIFY_AUTHORIZATION_TOKEN")
# Report only whether the token is present so the secret never lands in saved cell output.
print("Spotify Authorization Token:", "<set>" if spotify_token else None)

Spotify Authorization Token: None
