In [39]:
# Import modules
import pandas as pd
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

NOTE: Combine functions into a class later.

In [40]:
# Load the processed dataset from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/processed_data.csv')

In [41]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms_x,album_name,...,type,id,uri,track_href,analysis_url,duration_ms_y,time_signature,artist_pop,genres,track_pop
0,0,0,0,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
1,1,7734,73,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
2,2,14037,14,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
3,3,21536,42,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
4,4,24404,1,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69


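Duplicate rows are expected in the data: as the preview above shows, the same track can appear in several rows. In addition, some songs by different artists share the same title, so a title alone does not identify a song; duplicates are therefore identified by the artist name and the track name together.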

## Data Pre-processing

In [42]:
# Drop duplicate songs through combining artist and song

def drop_duplicate_songs(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with one row per (artist, track title) pair.

    Two rows are duplicates when both ``artist_name`` and ``track_name``
    match; the first occurrence is kept. Matching on the pair rather than on
    the title alone keeps songs that share a title but come from different
    artists.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``artist_name`` and ``track_name`` columns. It is
        not modified.

    Returns
    -------
    pd.DataFrame
        De-duplicated frame with an added ``artist_song`` column holding the
        artist name concatenated directly with the track name.

    Raises
    ------
    KeyError
        If ``artist_name`` or ``track_name`` is missing from ``df``.
    """
    # Vectorised concatenation on a new frame: avoids a slow row-wise apply
    # and leaves the caller's frame untouched.
    with_key = df.assign(artist_song=df['artist_name'] + df['track_name'])
    # Compare the two columns separately. The concatenated key alone is
    # ambiguous ('ab' + 'c' == 'a' + 'bc') and would merge unrelated songs.
    return with_key.drop_duplicates(subset=['artist_name', 'track_name'])

# Report the row count before de-duplication so the effect of the step is visible.
print(f'Number of songs before removing duplicates: {len(df)}')
# Rebind `df` to the de-duplicated frame returned by drop_duplicate_songs.
df = drop_duplicate_songs(df)
# Report the row count after de-duplication.
print(f'Number of songs after removing duplicates: {len(df)}')

Number of songs before removing duplicates:        Unnamed: 0.1  Unnamed: 0  pos    artist_name               track_uri  \
0                 0           0    0  Missy Elliott  0UaMYEvWZi0ZqiDOoHU3YI   
1                 1        7734   73  Missy Elliott  0UaMYEvWZi0ZqiDOoHU3YI   
2                 2       14037   14  Missy Elliott  0UaMYEvWZi0ZqiDOoHU3YI   
3                 3       21536   42  Missy Elliott  0UaMYEvWZi0ZqiDOoHU3YI   
4                 4       24404    1  Missy Elliott  0UaMYEvWZi0ZqiDOoHU3YI   
...             ...         ...  ...            ...                     ...   
67494         67494       67496   37          Jon D  3uCHI1gfOUL5j5swEh0TcH   
67495         67495       67499   40      Big Words  0P1oO2gREMYUCoOkzYAyFu   
67496         67496       67500   41   Allan Rayman  2oM4BuruDnEvk59IvIXCwn   
67497         67497       67501   42      Jon Jason  4Ri5TTUgjM96tbQZd5Ua7V   
67498         67498       67502   43       Grizfolk  5RVuBrXVLptAEbGJdSDzL5   

       

In [56]:
# Convert the space-separated genres string back into a list of genres.
# Only string values are split: entries that are already lists (e.g. when the
# cell is re-run) or that are missing pass through unchanged instead of
# raising AttributeError. `assign` builds a new frame, so this also avoids
# SettingWithCopyWarning when `df` is a column slice of another frame.
df = df.assign(
    genres=df['genres'].apply(
        lambda genres: genres.split(' ') if isinstance(genres, str) else genres
    )
)

## Feature Selection

The features I will use to build the recommendation system are:

**Audio**
* Danceability
* Energy
* Key
* Loudness
* Mode
* Speechiness
* Acousticness
* Instrumentalness
* Liveness
* Valence
* Tempo

**Metadata**
* id: The Spotify ID of the track, used to obtain information about it through the Spotify API
* genres: A list of genres that the artist's music is associated with
* artist_pop: Artist popularity (0-100)
* track_pop: Track popularity (0-100)

In [45]:
# List every column currently in the frame; the features are chosen from these
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'pos', 'artist_name', 'track_uri',
       'artist_uri', 'track_name', 'album_uri', 'duration_ms_x', 'album_name',
       'name', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url',
       'duration_ms_y', 'time_signature', 'artist_pop', 'genres', 'track_pop',
       'artist_song'],
      dtype='object')

In [53]:
# Selecting metadata and relevant features.
# `.copy()` makes the selection an independent frame, so later column
# assignments on `df` do not raise SettingWithCopyWarning.

df = df[[
    # Identifying metadata
    'artist_name', 'track_name',
    # Audio features
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo',
    # Genre and popularity metadata
    'genres', 'artist_pop', 'track_pop',
]].copy()

df.head()

Unnamed: 0,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genres,artist_pop,track_pop
0,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4,-7.105,0,0.121,0.0311,0.00697,0.0471,0.81,125.461,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,74,69
6,Britney Spears,Toxic,0.774,0.838,5,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,dance_pop pop post-teen_pop,84,83
19,Beyoncé,Crazy In Love,0.664,0.758,2,-6.583,0,0.21,0.00238,0.0,0.0598,0.701,99.259,dance_pop pop r&b,86,25
46,Justin Timberlake,Rock Your Body,0.892,0.714,4,-6.055,0,0.141,0.201,0.000234,0.0521,0.817,100.972,dance_pop pop,82,79
55,Shaggy,It Wasn't Me,0.853,0.606,0,-4.596,1,0.0713,0.0561,0.0,0.313,0.654,94.759,pop_rap reggae_fusion,75,2
