# This section of the code enriches the 2020 Spotify Viral Top 50 dataset with song characteristics (audio features) retrieved from the Spotify API via the `spotipy` Python library.

In [1]:
# Import necessary packages
import spotipy # library implementation of spotify API
import spotipy.util as util
import pandas as pd
import os

In [2]:
# API access keys (obtained by Tamas)
# SECURITY: credentials should not be committed to source control. Values are
# read from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment variables
# when set; the literals are kept only as a fallback so the notebook keeps
# running unchanged. TODO: rotate these keys and remove the fallback values.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID", "57435685ee4e49619c90408f70d4178d")
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET", "f07fc55f7cfd42b99315885a8e325bab")

In [3]:
# build a Spotify API client that authenticates with the client ID / secret pair
from spotipy.oauth2 import SpotifyClientCredentials

client_credentials_manager = SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

#### Be sure to change the path below to the matching directory on your own computer. We strongly recommend downloading the GitHub repo as a zip, extracting it, and pointing the working directory at the `data` folder inside it.

In [4]:
# Switch the working directory to the project's data folder.
# Raw string: the Windows backslashes must be taken literally; in a normal string
# sequences such as "\O", "\C" or "\W" are invalid escapes that newer Python versions warn about.
os.chdir(r'F:\OneDrive - Central European University\Courses\Winter_Term\Coding 3\Term Project\spotify-covid-analysis\data')

In [5]:
# load the previously concatenated Viral Top 50 charts from the working directory
spotify = pd.read_csv(filepath_or_buffer='spotify_viral_weekly_2020.csv')

In [6]:
# preview the first rows of the loaded chart data as a sanity check --> all in order
spotify.head()

Unnamed: 0,Position,Track Name,Artist,URL,Date
0,1,Pura Pura Lupa,Mahen,https://open.spotify.com/track/6Aw5k0jkFceYaya...,2020-01-02
1,2,GOODMORNINGTOKYO!,TOKYO’S REVENGE,https://open.spotify.com/track/0Q2n5yzl2XRqYbV...,2020-01-02
2,3,Blueberry faygo,Yung Anime,https://open.spotify.com/track/4ADzhwPWMQaTXNn...,2020-01-02
3,4,Catriona,Matthaios,https://open.spotify.com/track/4YMfE4r1JWADFw2...,2020-01-02
4,5,"All I Want - From ""High School Musical: The Mu...",Olivia Rodrigo,https://open.spotify.com/track/4Yxc55NX3tAXC2m...,2020-01-02


In [7]:
# derive the trackID column: the part of each URL after the last 'track/'
# (the whole URL if 'track/' does not occur); the Spotify API is queried by this ID
spotify['trackID'] = [url.rpartition('track/')[2] for url in spotify['URL']]

#### This section of the script iteratively runs through the dataset in batches of 100 songs so as not to overload the API with requests. The features of each song are requested through the Python library's respective function, and each song's feature dictionary is appended to a list.

In [8]:
# rows: collects one feature dict per track returned by the API
# batchsize: number of track IDs sent per request
# None_counter: number of tracks for which the API returned no features (None)
rows = []
batchsize = 100
None_counter = 0

for i in range(0, len(spotify['trackID']), batchsize):
    # explicit positional slice of the next (up to) `batchsize` track IDs, as a plain list
    batch = spotify['trackID'].iloc[i:i + batchsize].tolist()
    feature_results = sp.audio_features(batch)  # key function here (audio_features part of spotipy)
    # a separate loop variable avoids shadowing the outer batch offset `i`
    for features in feature_results:
        if features is None:  # identity check is the correct test for None
            None_counter += 1
        else:
            rows.append(features)

print('Number of tracks where no audio features were available:', None_counter)

Number of tracks where no audio features were available: 0


In [9]:
# turn the collected list of per-track feature dicts into a pandas dataframe
# (each dict key becomes a column)
df_audio_features = pd.DataFrame(rows)
print("Shape of the dataset:", df_audio_features.shape)
df_audio_features.head()  # quick visual check of the first rows

Shape of the dataset: (2650, 18)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.529,0.446,11,-9.167,1,0.0294,0.757,0.000764,0.146,0.14,111.903,audio_features,6Aw5k0jkFceYayaJoRupLT,spotify:track:6Aw5k0jkFceYayaJoRupLT,https://api.spotify.com/v1/tracks/6Aw5k0jkFceY...,https://api.spotify.com/v1/audio-analysis/6Aw5...,246964,4
1,0.907,0.539,4,-7.782,1,0.36,0.0408,0.0,0.253,0.664,124.918,audio_features,0Q2n5yzl2XRqYbV77tZNYp,spotify:track:0Q2n5yzl2XRqYbV77tZNYp,https://api.spotify.com/v1/tracks/0Q2n5yzl2XRq...,https://api.spotify.com/v1/audio-analysis/0Q2n...,150115,4
2,0.83,0.4,0,-11.032,1,0.0614,0.0224,7e-06,0.107,0.302,99.012,audio_features,4ADzhwPWMQaTXNnEP8gXnQ,spotify:track:4ADzhwPWMQaTXNnEP8gXnQ,https://api.spotify.com/v1/tracks/4ADzhwPWMQaT...,https://api.spotify.com/v1/audio-analysis/4ADz...,142068,4
3,0.71,0.388,5,-12.106,1,0.0545,0.67,0.0,0.177,0.759,114.858,audio_features,4YMfE4r1JWADFw2Fn74XW2,spotify:track:4YMfE4r1JWADFw2Fn74XW2,https://api.spotify.com/v1/tracks/4YMfE4r1JWAD...,https://api.spotify.com/v1/audio-analysis/4YMf...,204513,4
4,0.376,0.43,0,-6.585,0,0.0328,0.0902,0.0,0.0912,0.129,77.599,audio_features,4Yxc55NX3tAXC2mHRAhtcW,spotify:track:4Yxc55NX3tAXC2mHRAhtcW,https://api.spotify.com/v1/tracks/4Yxc55NX3tAX...,https://api.spotify.com/v1/audio-analysis/4Yxc...,177323,3


In [10]:
# remove the columns that are not used, to keep the dataset uncluttered
columns_to_drop = ['analysis_url', 'track_href', 'type', 'uri']
df_audio_features.drop(columns=columns_to_drop, inplace=True)

In [11]:
# rename the 'id' column to 'trackID' for merging purposes
df_audio_features.rename({'id': 'trackID'}, axis='columns', inplace=True)
df_audio_features.shape

(2650, 14)

In [12]:
# preview the first rows of the song-characteristics dataset after dropping/renaming columns
df_audio_features.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,trackID,duration_ms,time_signature
0,0.529,0.446,11,-9.167,1,0.0294,0.757,0.000764,0.146,0.14,111.903,6Aw5k0jkFceYayaJoRupLT,246964,4
1,0.907,0.539,4,-7.782,1,0.36,0.0408,0.0,0.253,0.664,124.918,0Q2n5yzl2XRqYbV77tZNYp,150115,4
2,0.83,0.4,0,-11.032,1,0.0614,0.0224,7e-06,0.107,0.302,99.012,4ADzhwPWMQaTXNnEP8gXnQ,142068,4
3,0.71,0.388,5,-12.106,1,0.0545,0.67,0.0,0.177,0.759,114.858,4YMfE4r1JWADFw2Fn74XW2,204513,4
4,0.376,0.43,0,-6.585,0,0.0328,0.0902,0.0,0.0912,0.129,77.599,4Yxc55NX3tAXC2mHRAhtcW,177323,3


In [13]:
# Count the rows per trackID: the concatenated Top 50 dataset contains repetitions
# (i.e. songs that were viral for more than one week)
grouped = df_audio_features.groupby('trackID').size()

# Number of distinct tracks that occur more than once
grouped[grouped.gt(1)].count()

541

#### We have 541 songs throughout the 2020 Top 50 Weekly Viral Charts that showed up more than once.

In [14]:
# Keep only the first row for each trackID and discard the repeated ones (in place)
df_audio_features.drop_duplicates(subset='trackID', keep='first', inplace=True)

In [15]:
# Inspect the (rows, columns) dimensions of the audio features dataset after de-duplication
df_audio_features.shape

(886, 14)

#### There are 886 unique songs across the weekly Viral Top 50 charts of 2020.

In [16]:
# Attach the audio features to every chart entry by inner-joining on trackID
df = spotify.merge(df_audio_features, how='inner', on='trackID')

In [17]:
# Check the (rows, columns) dimensions of the merged dataset
df.shape

(2650, 19)

#### We have data for all of the songs of our 2020 dataset (i.e. 50 songs/week * 53 weeks = 2650 non-unique songs).

In [18]:
# Sneak peek at the first rows of the merged data
df.head()

Unnamed: 0,Position,Track Name,Artist,URL,Date,trackID,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,1,Pura Pura Lupa,Mahen,https://open.spotify.com/track/6Aw5k0jkFceYaya...,2020-01-02,6Aw5k0jkFceYayaJoRupLT,0.529,0.446,11,-9.167,1,0.0294,0.757,0.000764,0.146,0.14,111.903,246964,4
1,1,Pura Pura Lupa,Mahen,https://open.spotify.com/track/6Aw5k0jkFceYaya...,2020-01-09,6Aw5k0jkFceYayaJoRupLT,0.529,0.446,11,-9.167,1,0.0294,0.757,0.000764,0.146,0.14,111.903,246964,4
2,1,Pura Pura Lupa,Mahen,https://open.spotify.com/track/6Aw5k0jkFceYaya...,2020-01-16,6Aw5k0jkFceYayaJoRupLT,0.529,0.446,11,-9.167,1,0.0294,0.757,0.000764,0.146,0.14,111.903,246964,4
3,2,Pura Pura Lupa,Mahen,https://open.spotify.com/track/6Aw5k0jkFceYaya...,2020-01-23,6Aw5k0jkFceYayaJoRupLT,0.529,0.446,11,-9.167,1,0.0294,0.757,0.000764,0.146,0.14,111.903,246964,4
4,4,Pura Pura Lupa,Mahen,https://open.spotify.com/track/6Aw5k0jkFceYaya...,2020-01-30,6Aw5k0jkFceYayaJoRupLT,0.529,0.446,11,-9.167,1,0.0294,0.757,0.000764,0.146,0.14,111.903,246964,4


In [19]:
# write the merged dataset to CSV (without the index) for the Covid-19 API enrichment step
df.to_csv(path_or_buf='all_spotify_data.csv', index=False)