In [87]:
# Import modules
import pandas as pd
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [118]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

NOTE: Combine functions into a class later.

In [88]:
# Import data
df = pd.read_csv('../data/processed_data.csv')

In [89]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms_x,album_name,...,type,id,uri,track_href,analysis_url,duration_ms_y,time_signature,artist_pop,genres,track_pop
0,0,0,0,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
1,1,7734,73,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
2,2,14037,14,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
3,3,21536,42,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
4,4,24404,1,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69


It's expected that there might be duplicates of songs inside the data because there exists some songs that were made by different artists but have the same title. 

## Data Pre-processing

In [90]:
# Drop duplicate songs through combining artist and song

def drop_duplicate_songs(df: pd.DataFrame):
    """
     Drops duplicate songs that exist due to different songs containing the same title.
    """
    df['artist_song'] = df.apply(lambda row: row['artist_name']+row['track_name'],axis = 1)
    return df.drop_duplicates(subset='artist_song')

print(f'Number of songs before removing duplicates: {len(df)}')
df = drop_duplicate_songs(df)
print(f'Number of songs after removing duplicates: {len(df)}')

Number of songs before removing duplicates: 67499
Number of songs after removing duplicates: 34247


In [91]:
# Convert the genres column back into a list
df['genres'] = df['genres'].apply(lambda row: row.split(' '))

## Feature Selection

The features I will be using moving forward in the building of the recommendation system are:

**Audio**
* Danceability
* Energy
* Key
* Loudness
* Mode
* Speechiness
* Acousticness
* Instrumentalness
* Liveness
* Valence
* Tempo

**Metadata**
* id: The URI of the track to obtain info about it using Spotify API
* genres: A list of genres the artists music associates with
* artist_pop: Artist popularity (0-100)
* track_pop: Track popularity (0-100)

In [92]:
# Features for consideration
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'pos', 'artist_name', 'track_uri',
       'artist_uri', 'track_name', 'album_uri', 'duration_ms_x', 'album_name',
       'name', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url',
       'duration_ms_y', 'time_signature', 'artist_pop', 'genres', 'track_pop',
       'artist_song'],
      dtype='object')

In [93]:
# Selecting metadata and relevant features

song_df = df[['artist_name', 'track_name', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'genres', 'artist_pop', 'track_pop']]

song_df.head()

Unnamed: 0,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genres,artist_pop,track_pop
0,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4,-7.105,0,0.121,0.0311,0.00697,0.0471,0.81,125.461,"[dance_pop, hip_hop, hip_pop, pop, pop_rap, r&...",74,69
6,Britney Spears,Toxic,0.774,0.838,5,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,"[dance_pop, pop, post-teen_pop]",84,83
19,Beyoncé,Crazy In Love,0.664,0.758,2,-6.583,0,0.21,0.00238,0.0,0.0598,0.701,99.259,"[dance_pop, pop, r&b]",86,25
46,Justin Timberlake,Rock Your Body,0.892,0.714,4,-6.055,0,0.141,0.201,0.000234,0.0521,0.817,100.972,"[dance_pop, pop]",82,79
55,Shaggy,It Wasn't Me,0.853,0.606,0,-4.596,1,0.0713,0.0561,0.0,0.313,0.654,94.759,"[pop_rap, reggae_fusion]",75,2


## Feature Engineering

We will now perform the following feature engineering steps to prepare the data for the recommendation system:

1. TF-IDF

    TF-IDF (Term Frequency-Inverse Document Frequency) is a method of computing the importance of a word in each document and the entire corpus (collection of documents).
    In our case, we may consider each song as a document and the words in each song as the genres associated with the artist who sung each song. This will result
    in a matrix which highlights the importance of each genre across each individual song as well as all of the users commonly played songs.
    
2. Feature Scaling

    Feature scaling is required in our case to get the features in a consistent unit. It can be seen that artist_pop and track_pop are using a different unit and as a result,
    have different magnitudes compared to the remaining numeric features. We will apply normalization to scale the artist_pop and track_pop to between 0 and 1 as a majority of
    the other features appear to exist on a positive scale.

In [94]:
# Performing TF-IDF
tfidf = TfidfVectorizer()
tfidf_array = tfidf.fit_transform(song_df['genres'].apply(lambda x: " ".join(x))).toarray()
tfidf_df = pd.DataFrame(tfidf_array)
tfidf_df.columns = ['genre' + '|' + i for i in tfidf.get_feature_names_out()]
tfidf_df.drop(columns='genre|unknown', inplace=True)
tfidf_df.reset_index(drop=True, inplace=True)

In [95]:
# Performing Normalization on popularities
scaler = MinMaxScaler()
song_df[['artist_pop', 'track_pop']] = scaler.fit_transform(song_df[['artist_pop', 'track_pop']])

# Performing Normalization on floating columns
float_col = song_df.iloc[:, 2:-3].columns
song_df[float_col] = scaler.fit_transform(song_df[float_col])
song_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df[['artist_pop', 'track_pop']] = scaler.fit_transform(song_df[['artist_pop', 'track_pop']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df[float_col] = scaler.fit_transform(song_df[float_col])


Unnamed: 0,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genres,artist_pop,track_pop
0,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0.91498,0.813,0.363636,0.842733,0.0,0.12578,0.031225,0.007005,0.0471,0.811623,0.572105,"[dance_pop, hip_hop, hip_pop, pop, pop_rap, r&...",0.74,0.71134
6,Britney Spears,Toxic,0.783401,0.838,0.454545,0.893573,0.0,0.118503,0.025,0.025126,0.242,0.925852,0.652266,"[dance_pop, pop, post-teen_pop]",0.84,0.85567
19,Beyoncé,Crazy In Love,0.672065,0.758,0.181818,0.85105,0.0,0.218295,0.00239,0.0,0.0598,0.702405,0.452624,"[dance_pop, pop, r&b]",0.86,0.257732
46,Justin Timberlake,Rock Your Body,0.902834,0.714,0.363636,0.859462,0.0,0.14657,0.201807,0.000235,0.0521,0.818637,0.460435,"[dance_pop, pop]",0.82,0.814433
55,Shaggy,It Wasn't Me,0.86336,0.606,0.0,0.882707,1.0,0.074116,0.056325,0.0,0.313,0.655311,0.432103,"[pop_rap, reggae_fusion]",0.75,0.020619


In [105]:
# Concatenate features
print(tfidf_df.shape,
song_df.shape)

# Note: Need to reset index otherwise concat will increase number of rows due to indicies being stored in memory
tfidf_df.reset_index(drop=True, inplace=True)
song_df.reset_index(drop=True, inplace=True)

final_df = pd.concat([song_df, tfidf_df], axis = 1)

(34247, 2146) (34247, 16)


In [110]:
# Add song id to each song to be able to access it using Spotify API
final_df['id'] = df['id'].values

In [116]:
# Save modified features
float_cols = song_df.dtypes[song_df.dtypes == 'float64'].index.values
song_df.to_csv('../data/intermediate/float_song_features.csv', index=False)

In [117]:
# Save complete features (with TF-IDF)
final_df.to_csv('../data/intermediate/all_song_features.csv', index=False)