In [44]:
import numpy as np
import pandas as pd
import re
import json
import sys
import itertools
import matplotlib.pyplot as plt
import warnings
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings("ignore")

In [45]:
%matplotlib inline

In [46]:
# Load the track dataset; the path is relative to the notebook's working directory.
# NOTE(review): the CSV appears to carry a previously saved index, which shows up as
# the 'Unnamed: 0' column in the dtypes listing — confirm whether it is still needed.
spotify_df = pd.read_csv('data/spotify_data.csv')

In [47]:
# Preview the first rows to confirm the file parsed into the expected columns.
spotify_df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.48,0.3,4,-10.06,1,0.04,0.69,0.0,0.12,0.14,133.41,240166,3
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.57,0.45,3,-10.29,1,0.03,0.48,0.0,0.1,0.52,140.18,216387,4
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.41,0.23,3,-13.71,1,0.03,0.34,0.0,0.09,0.14,139.83,158960,4
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.39,0.25,10,-9.85,1,0.04,0.81,0.0,0.08,0.51,204.96,304293,4
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.79,6,-5.42,0,0.03,0.07,0.02,0.11,0.22,171.86,244320,4


In [48]:
# Inspect the dtype of every column; the float64 columns are picked out further down
# (float_cols) as the ones to be scaled.
spotify_df.dtypes

Unnamed: 0            int64
artist_name          object
track_name           object
track_id             object
popularity            int64
year                  int64
genre                object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms           int64
time_signature        int64
dtype: object

In [49]:
# Look at the raw genre values to see whether each entry is a real list, a string that
# merely looks like a list, or a plain label — this determines how genre is vectorised.
spotify_df['genre'].values

array(['acoustic', 'acoustic', 'acoustic', ..., 'trip-hop', 'trip-hop',
       'trip-hop'], dtype=object)

Feature Engineering

In [50]:
# Check the last rows as well, to see how the frame ends (final index and genre values).
spotify_df.tail()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
1159759,1473391,Nicola Conte,Black Spirits,0m27F0IGHLGAWhqd6ccYst,4,2011,trip-hop,0.37,0.74,10,-6.45,0,0.07,0.33,0.0,0.16,0.52,107.95,344013,3
1159760,1473392,Nicola Conte,Quiet Dawn,6er9p611eHEcUCU50j7D57,3,2011,trip-hop,0.52,0.68,7,-7.59,0,0.03,0.79,0.0,0.13,0.26,119.9,285067,4
1159761,1473393,Amon Tobin,Morning Ms Candis,7jsMMqxy1tt0rH5FzYcZTQ,2,2011,trip-hop,0.49,0.44,5,-8.51,1,0.03,0.48,0.0,0.09,0.04,100.08,214253,4
1159762,1473394,Peace Orchestra,Happy Christmas (War Is Over),77lA1InUaXztuRk2vOzD1S,0,2011,trip-hop,0.48,0.41,0,-13.34,1,0.03,0.43,0.0,0.12,0.2,133.88,239133,3
1159763,1473395,Mo' Horizons,Hit the Road Jack (Pé Na Éstrada),4oMiOwhDZEdBuzAfhzRHbi,3,2011,trip-hop,0.78,0.86,1,-7.29,0,0.12,0.22,0.0,0.06,0.86,89.99,212227,4


In [51]:
# Names of all float64 columns, obtained by masking the column index with a per-column
# dtype test; this Index is later passed to create_feature_set as the columns to scale.
float_cols = spotify_df.columns[(spotify_df.dtypes == 'float64').to_numpy()]
print(float_cols)

Index(['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo'],
      dtype='object')


In [52]:
# Name of the column intended for one-hot encoding.
# NOTE(review): ohe_cols is not referenced by any other cell visible in this notebook
# section — confirm it is still used before keeping it.
ohe_cols = 'popularity'

In [53]:
# Show floats with two fixed decimals so the summary below is not printed in scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

# Descriptive statistics for the popularity score (count, mean, std, min, quartiles, max);
# NaN values are excluded from the calculation.
spotify_df['popularity'].describe()

count   1159764.00
mean         18.38
std          15.89
min           0.00
25%           5.00
50%          15.00
75%          29.00
max         100.00
Name: popularity, dtype: float64

In [54]:
# Create popularity_transform by bucketing the popularity score into steps of 5:
# each score is divided by 5 and the result is truncated toward zero (e.g. 68 -> 13).
# This is done as a single vectorised operation rather than a Python lambda per row;
# the explicit int64 cast truncates exactly like int() and keeps the dtype platform-independent.
spotify_df['popularity_transform'] = spotify_df['popularity'].div(5).astype('int64')

# Get the position of the 'popularity' column
popularity_index = spotify_df.columns.get_loc('popularity')

# Move 'popularity_transform' so it sits directly after 'popularity'. Popping the column
# first keeps this cell safe to re-run (insert() refuses a name that already exists).
spotify_df.insert(popularity_index + 1, 'popularity_transform', spotify_df.pop('popularity_transform'))

In [55]:
# Confirm that popularity_transform now appears right after popularity.
spotify_df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,popularity_transform,year,genre,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,13,2012,acoustic,0.48,0.3,...,-10.06,1,0.04,0.69,0.0,0.12,0.14,133.41,240166,3
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,10,2012,acoustic,0.57,0.45,...,-10.29,1,0.03,0.48,0.0,0.1,0.52,140.18,216387,4
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,11,2012,acoustic,0.41,0.23,...,-13.71,1,0.03,0.34,0.0,0.09,0.14,139.83,158960,4
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,11,2012,acoustic,0.39,0.25,...,-9.85,1,0.04,0.81,0.0,0.08,0.51,204.96,304293,4
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,10,2012,acoustic,0.43,0.79,...,-5.42,0,0.03,0.07,0.02,0.11,0.22,171.86,244320,4


In [56]:
# TF-IDF cannot handle nulls, so first establish whether the DataFrame contains any at all
any_null = spotify_df.isna().to_numpy().any()

# Per-column tally of missing values
null_counts = spotify_df.isna().sum()

print(f"Any null values in DataFrame: {any_null}")
print("\nCount of null values in each column:", null_counts, sep="\n")

Any null values in DataFrame: True

Count of null values in each column:
Unnamed: 0               0
artist_name             15
track_name               1
track_id                 0
popularity               0
popularity_transform     0
year                     0
genre                    0
danceability             0
energy                   0
key                      0
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness         0
liveness                 0
valence                  0
tempo                    0
duration_ms              0
time_signature           0
dtype: int64


In [57]:
# Replace every non-string entry (which includes missing values) in the two text
# columns with an empty string, leaving genuine strings untouched.
for text_col in ('artist_name', 'track_name'):
    spotify_df[text_col] = spotify_df[text_col].apply(
        lambda value: value if isinstance(value, str) else ''
    )


In [58]:
# Verify that the clean-up above removed every null value
any_null = spotify_df.isna().to_numpy().any()

# Per-column tally of missing values (all zeros expected now)
null_counts = spotify_df.isna().sum()

print(f"Any null values in DataFrame: {any_null}")
print("\nCount of null values in each column:", null_counts, sep="\n")

Any null values in DataFrame: False

Count of null values in each column:
Unnamed: 0              0
artist_name             0
track_name              0
track_id                0
popularity              0
popularity_transform    0
year                    0
genre                   0
danceability            0
energy                  0
key                     0
loudness                0
mode                    0
speechiness             0
acousticness            0
instrumentalness        0
liveness                0
valence                 0
tempo                   0
duration_ms             0
time_signature          0
dtype: int64


In [59]:
def create_one_hot_encoded_features(df, column, new_name):
    """
    One-hot encode a single column of the Spotify dataframe.

    Parameters:
        df (pandas dataframe): Spotify Dataframe
        column (str): Name of the column whose distinct values become indicator columns
        new_name (str): Prefix for the generated columns, which are named "<new_name>|<value>"

    Returns:
        transformed_df: One indicator column per distinct value, on a fresh 0..n-1 index
    """
    prefix = new_name + "|"

    dummies = pd.get_dummies(df[column])
    dummies.columns = [prefix + str(level) for level in dummies.columns]

    # Drop the source index so the result can be concatenated positionally with other feature frames
    return dummies.reset_index(drop=True)

TF-IDF Section

TF-IDF stands for Term Frequency–Inverse Document Frequency. It is a very common technique for turning text into a meaningful numeric representation that a machine learning algorithm can be fitted on. It is a statistical formula that converts text documents into vectors based on the relevance of each word: a term's weight grows with how often it appears in a document and shrinks with the number of documents that contain it. It builds on the bag-of-words model to create a matrix that records which words are more relevant and which are less relevant in each document.

In [65]:
# Build entire feature set
def create_feature_set(df, float_cols, year_weight=0.5, popularity_weight=0.15, float_weight=0.2):
    """ 
    Process spotify df to create a final set of features that will be used to generate recommendations

    The input dataframe is read only; it is never modified.

    Parameters: 
        df (pandas dataframe): Spotify Dataframe. Must contain the columns 'genre', 'year',
            'popularity_transform' and 'track_id', plus every column named in float_cols
        float_cols (list(str)): List of float columns that will be scaled 
        year_weight (float): Multiplier applied to the one-hot encoded 'year' columns
        popularity_weight (float): Multiplier applied to the one-hot encoded
            'popularity_transform' columns
        float_weight (float): Multiplier applied to the min-max scaled float columns
        
    Returns: 
        final: final set of features — TF-IDF genre columns ('genre|<token>'), the scaled
            float columns, 'pop|<bucket>' columns, 'year|<year>' columns and 'track_id'

    Raises:
        ValueError: if any genre entry is not a string or is blank
    """
    # Normalise genres on a local Series. Assigning the result back into df['genre']
    # would silently rewrite the caller's DataFrame.
    processed_genres = df['genre'].apply(lambda x: str(x) if isinstance(x, str) else '')

    # Non-string entries were blanked above, so this check also rejects missing genres
    if processed_genres.str.strip().eq('').any():
        raise ValueError("Some documents are empty after preprocessing. Check the 'genre' column.")

    # tfidf genre lists
    # NOTE(review): with the default tokenizer, hyphenated genres are split into separate
    # tokens (the notebook output shows columns such as 'genre|alt' and 'genre|and') —
    # confirm this is the intended granularity.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(processed_genres)
    genre_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=['genre' + "|" + i for i in tfidf.get_feature_names_out()],
    )

    # One-hot blocks, each down-weighted so that no single block dominates the feature vector
    year_ohe = create_one_hot_encoded_features(df, 'year', 'year') * year_weight
    popularity_ohe = create_one_hot_encoded_features(df, 'popularity_transform', 'pop') * popularity_weight

    # scale float columns to [0, 1]; the index is reset so rows line up positionally in the concat below
    floats = df[float_cols].reset_index(drop=True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns=floats.columns) * float_weight

    # concatenate all features
    final = pd.concat([genre_df, floats_scaled, popularity_ohe, year_ohe], axis=1)

    # add song id (.values ignores df's index so the ids are attached by position)
    final['track_id'] = df['track_id'].values

    return final

In [66]:
# Build the full feature matrix for every track, scaling the float64 columns collected in float_cols.
complete_feature_set = create_feature_set(spotify_df, float_cols = float_cols)

In [67]:
# Spot-check the generated feature columns (genre|..., pop|..., year|..., track_id).
complete_feature_set.head()

Unnamed: 0,genre|acoustic,genre|afrobeat,genre|age,genre|alt,genre|ambient,genre|and,genre|bass,genre|black,genre|blues,genre|breakbeat,...,year|2015,year|2016,year|2017,year|2018,year|2019,year|2020,year|2021,year|2022,year|2023,track_id
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53QF56cjZA9RTuuMZDrSA6
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1s8tP3jP4GZcyHDsjvw218
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7BRCa8MPiyuvr2VU3O9W0F
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63wsZUhUZLlh1OsyrZq7sz
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6nXIYClvJAfi6ujLiKqEq8


Spotify API

In [None]:
import os

from dotenv import load_dotenv

# Pull the variables defined in the .env file into the process environment
load_dotenv()

# Spotify API credentials are read from the environment rather than hardcoded;
# each lookup yields None when the variable is not set.
spotify_client_id = os.environ.get('CLIENT_ID')
spotify_client_secret = os.environ.get('CLIENT_SECRET')