In [1]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

In [2]:
%matplotlib inline

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [4]:
spotify_data = pd.read_csv('dataset.csv')

In [5]:
spotify_data.head(10)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic
5,5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,Days I Will Remember,58,214240,False,0.688,0.481,...,-8.807,1,0.105,0.289,0.0,0.189,0.666,98.017,4,acoustic
6,6,6Vc5wAMmXdKIAM7WUoEb7N,A Great Big World;Christina Aguilera,Is There Anybody Out There?,Say Something,74,229400,False,0.407,0.147,...,-8.822,1,0.0355,0.857,3e-06,0.0913,0.0765,141.284,3,acoustic
7,7,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,80,242946,False,0.703,0.444,...,-9.331,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic
8,8,0IktbUcnAGrvD03AWnz3Q8,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,74,189613,False,0.625,0.414,...,-8.7,1,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic
9,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,...,-6.77,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic


In [6]:
spotify_data.dtypes

Unnamed: 0            int64
track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object

In [7]:
spotify_data['track_genre'].values[0]

'acoustic'

In [8]:
spotify_data['genre'] = spotify_data['track_genre'].apply(lambda x: [x])

In [9]:
spotify_data['genre'].values[0]

['acoustic']

In [10]:
spotify_data['artists'].values[0]

'Gen Hoshino'

In [11]:
spotify_data['artists_upd'] = spotify_data['artists'].apply(lambda x: [x])

In [12]:
spotify_data['artists_upd'].values[0]

['Gen Hoshino']

In [13]:
spotify_data[spotify_data['genre'].apply(lambda x: not x)]

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,genre,artists_upd


In [14]:
spotify_data['artists_song'] = spotify_data.apply(lambda row: row['artists_upd'][0]+row['track_name'],axis = 1)

In [15]:
spotify_data.sort_values(['artists_song'], ascending = False, inplace = True)

In [16]:
spotify_data[spotify_data['track_name'] == 'Hunger']

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,genre,artists_upd,artists_song
9,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,...,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,[acoustic],[Ross Copperman],Ross CoppermanHunger
79110,79110,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,...,0.426,0.00419,0.0735,0.196,78.899,4,piano,[piano],[Ross Copperman],Ross CoppermanHunger
39337,39337,3ksrirMSqxsvNebKmx5UEM,Hans Zimmer,Hans Zimmer: Epic Scores,Hunger,13,395720,False,0.31,0.329,...,0.146,0.889,0.111,0.0352,111.923,4,german,[german],[Hans Zimmer],Hans ZimmerHunger


In [17]:
artists_exploded = spotify_data[['artists_upd','track_id']].explode('artists_upd')

In [18]:
artists_exploded_enriched = artists_exploded.merge(spotify_data, how = 'left', left_on = 'artists_upd',right_on = 'artists')
artists_exploded_enriched_nonnull = artists_exploded_enriched[~artists_exploded_enriched.genre.isnull()]

In [19]:
artists_exploded_enriched_nonnull[artists_exploded_enriched_nonnull['track_id_y'] =='7k9GuJYLp2AzqokyEdwEw2']

Unnamed: 0.1,artists_upd_x,track_id_x,Unnamed: 0,track_id_y,artists,album_name,track_name,popularity,duration_ms,explicit,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,genre,artists_upd_y,artists_song
728770,Ross Copperman,5dH4xnErpUOIDrGySWGkfJ,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,[acoustic],[Ross Copperman],Ross CoppermanHunger
728771,Ross Copperman,5dH4xnErpUOIDrGySWGkfJ,79110,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,piano,[piano],[Ross Copperman],Ross CoppermanHunger
728778,Ross Copperman,5dH4xnErpUOIDrGySWGkfJ,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,[acoustic],[Ross Copperman],Ross CoppermanHunger
728779,Ross Copperman,5dH4xnErpUOIDrGySWGkfJ,79110,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,piano,[piano],[Ross Copperman],Ross CoppermanHunger
728786,Ross Copperman,7k9GuJYLp2AzqokyEdwEw2,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,[acoustic],[Ross Copperman],Ross CoppermanHunger
728787,Ross Copperman,7k9GuJYLp2AzqokyEdwEw2,79110,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,piano,[piano],[Ross Copperman],Ross CoppermanHunger
728794,Ross Copperman,7k9GuJYLp2AzqokyEdwEw2,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,[acoustic],[Ross Copperman],Ross CoppermanHunger
728795,Ross Copperman,7k9GuJYLp2AzqokyEdwEw2,79110,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,piano,[piano],[Ross Copperman],Ross CoppermanHunger
728802,Ross Copperman,0v0pc1lIt5p6EBX7pnfOGF,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,[acoustic],[Ross Copperman],Ross CoppermanHunger
728803,Ross Copperman,0v0pc1lIt5p6EBX7pnfOGF,79110,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,...,0.426,0.00419,0.0735,0.196,78.899,4,piano,[piano],[Ross Copperman],Ross CoppermanHunger


In [20]:
artists_genres_consolidated = artists_exploded_enriched_nonnull.groupby('track_id_y')['genre'].apply(list).reset_index()

In [21]:
artists_genres_consolidated['consolidates_genre_lists'] = artists_genres_consolidated['genre'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))

In [22]:
artists_genres_consolidated.head()

Unnamed: 0,track_id_y,genre,consolidates_genre_lists
0,0000vdREvCVMxbQTkS888c,[[german]],[german]
1,000CC8EParg64OmTxVnZ0p,"[[club], [club], [club], [club], [club], [club...",[club]
2,000Iz0K615UepwSJ5z2RE5,[[minimal-techno]],[minimal-techno]
3,000RDCYioLteXcutOjeweY,"[[hip-hop], [hip-hop], [hip-hop], [hip-hop], [...",[hip-hop]
4,000qpdoc97IMTBvF8gwcpy,"[[minimal-techno], [minimal-techno], [minimal-...",[minimal-techno]


In [23]:
spotify_data = spotify_data.merge(
    artists_genres_consolidated[['track_id_y','consolidates_genre_lists']],
    left_on='track_id',        
    right_on='track_id_y',     
    how='left'
)

In [24]:
spotify_data.tail()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,liveness,valence,tempo,time_signature,track_genre,genre,artists_upd,artists_song,track_id_y,consolidates_genre_lists
113995,14426,5qtlopq4SnnvVeiQVt3M0n,"""Puppy Dog Pals"" Cast",Puppy Dog Pals: Disney Junior Music,Puppy Dog Pals Main Title Theme,60,57789,False,0.781,0.936,...,0.202,0.873,182.148,4,children,[children],"[""Puppy Dog Pals"" Cast]","""Puppy Dog Pals"" CastPuppy Dog Pals Main Title...",5qtlopq4SnnvVeiQVt3M0n,[children]
113996,14615,4aY2hh55axhL2qYYqXNoOM,"""Puppy Dog Pals"" Cast",Puppy Dog Pals: Disney Junior Music,Going on a Mission,55,38144,False,0.629,0.776,...,0.093,0.957,93.937,3,children,[children],"[""Puppy Dog Pals"" Cast]","""Puppy Dog Pals"" CastGoing on a Mission",4aY2hh55axhL2qYYqXNoOM,[children]
113997,105160,1d4ZvL8uuUPTEAnocC3zEa,!nvite,strolling,strolling,41,138875,False,0.857,0.381,...,0.126,0.329,84.997,4,study,[study],[!nvite],!nvitestrolling,1d4ZvL8uuUPTEAnocC3zEa,[study]
113998,105831,3v6ypsJzaoY2xgYp6mMJfM,!nvite,pagadoff,pagadoff,5,135860,False,0.784,0.657,...,0.131,0.501,84.997,4,study,[study],[!nvite],!nvitepagadoff,3v6ypsJzaoY2xgYp6mMJfM,[study]
113999,65900,1kR4gIb7nGxHPI3D2ifs59,,,,0,0,False,0.501,0.583,...,0.0747,0.734,138.391,4,k-pop,[k-pop],[nan],,1kR4gIb7nGxHPI3D2ifs59,[k-pop]


In [25]:
float_cols =  spotify_data.dtypes[spotify_data.dtypes == 'float64'].index.values

In [27]:
ohe_cols = 'popularity'

In [29]:
spotify_data['popularity'].describe()

count    114000.000000
mean         33.238535
std          22.305078
min           0.000000
25%          17.000000
50%          35.000000
75%          50.000000
max         100.000000
Name: popularity, dtype: float64

In [31]:
spotify_data['popularity_red'] = spotify_data['popularity'].apply(lambda x: int(x/5))

In [32]:
spotify_data['consolidates_genre_lists'] = spotify_data['consolidates_genre_lists'].apply(lambda d: d if isinstance(d, list) else [])

In [33]:
spotify_data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,valence,tempo,time_signature,track_genre,genre,artists_upd,artists_song,track_id_y,consolidates_genre_lists,popularity_red
0,45933,1t4LWH3LrFAwCTHgUKM16s,龍藏Ryuzo,ANISON Explosion~Acoustic Solo Guitar~,紅蓮の弓矢 (Instrumental),20,162549,False,0.424,0.508,...,0.464,177.591,4,guitar,[guitar],[龍藏Ryuzo],龍藏Ryuzo紅蓮の弓矢 (Instrumental),1t4LWH3LrFAwCTHgUKM16s,[guitar],4
1,45585,5Qn6Ys1fHlef8zgCLqCdud,龍藏Ryuzo,ANISON Explosion~Acoustic Solo Guitar~,ゲゲゲの鬼太郎 (Instrumental),23,168137,False,0.571,0.325,...,0.32,116.457,3,guitar,[guitar],[龍藏Ryuzo],龍藏Ryuzoゲゲゲの鬼太郎 (Instrumental),5Qn6Ys1fHlef8zgCLqCdud,[guitar],4
2,12684,1Q5d3X55pI7nBXYAdZ0g8Z,黃敏華,關不掉的聲音,堤岸,20,218733,False,0.549,0.478,...,0.341,125.917,4,cantopop,[cantopop],[黃敏華],黃敏華堤岸,1Q5d3X55pI7nBXYAdZ0g8Z,[cantopop],4
3,70422,4xZIMRwaaBx7CZMmM6KLuh,黃小琥,簡單不簡單,沒那麽簡單,56,310133,False,0.334,0.431,...,0.212,201.701,3,mandopop,[mandopop],[黃小琥],黃小琥沒那麽簡單,4xZIMRwaaBx7CZMmM6KLuh,[mandopop],11
4,70870,0xCXMZlt1QfWfhtuTIFBpk,黃妃,水水水,溫暖的所在,23,291400,False,0.549,0.45,...,0.234,90.062,3,mandopop,[mandopop],[黃妃],黃妃溫暖的所在,0xCXMZlt1QfWfhtuTIFBpk,[mandopop],4


In [34]:
def ohe_prep(df, column, new_name): 
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

In [41]:
def create_feature_set(df, float_cols):
    
    #tfidf genre lists
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]
    genre_df.reset_index(drop = True, inplace=True)

    #explicity_ohe = ohe_prep(df, 'explicit','exp')    
    popularity_ohe = ohe_prep(df, 'popularity_red','pop') * 0.15

    #scale float columns
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

    #concanenate all features
    final = pd.concat([genre_df, floats_scaled, popularity_ohe], axis = 1)
     
    #add song id
    final['id']=df['track_id'].values
    
    return final

In [42]:
complete_feature_set = create_feature_set(spotify_data, float_cols=float_cols)

In [43]:
complete_feature_set.head()

Unnamed: 0,genre|acoustic,genre|afrobeat,genre|age,genre|alt,genre|alternative,genre|ambient,genre|and,genre|anime,genre|bass,genre|black,...,pop|12,pop|13,pop|14,pop|15,pop|16,pop|17,pop|18,pop|19,pop|20,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1t4LWH3LrFAwCTHgUKM16s
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5Qn6Ys1fHlef8zgCLqCdud
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1Q5d3X55pI7nBXYAdZ0g8Z
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4xZIMRwaaBx7CZMmM6KLuh
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0xCXMZlt1QfWfhtuTIFBpk
