In [1]:
# Importing necessary libraries

import spotipy
import pandas as pd
import numpy as np
import time
from requests.exceptions import ReadTimeout
from spotipy.exceptions import SpotifyException
from timeit import default_timer as timer
from datetime import timedelta
from pandas.api.types import CategoricalDtype

In [2]:
# Spotify API tokens for access
from spotipy.oauth2 import SpotifyClientCredentials

client_id= '82e34e96969c4e11ac1b014fa90d434e'
client_secret = '17b50408fa4d48658c8bd5aa43064c9a'
client_credentials_manager = SpotifyClientCredentials(
                                client_id = client_id, client_secret = client_secret)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# function to know how long it'll take to scrape the data

def format_time(seconds):
    minutes = seconds // 60
    seconds = seconds % 60
    if minutes > 0:
        return f"{minutes} minutes, {seconds} seconds"
    else:
        return f"{seconds} seconds"

In [4]:
# start time
start_time = timer()

# For Burna Boy as a main artist
artist_name = []
track_name = []
track_id = []
album_name = []
album_id = []
release_date = []
popularity = []
explicit = []
danceability = []
duration_ms = []
energy = []
key = []
loudness = []
mode = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []
time_signature = []
featured_artists = []

# loop through results, using offset to get all tracks
for i in range(0, 1000, 50):
    try:
        track_results = sp.search(q='artist: Burna Boy', type='track', limit=50, offset=i)
        for i, t in enumerate(track_results['tracks']['items']):
        
            # get track details
            artist_name.append(t['artists'][0]['name'])
            track_name.append(t['name'])
            track_id.append(t['id'])
            album_name.append(t['album']['name'])
            album_id.append(t['album']['id'])
            release_date.append(t['album']['release_date'])
            popularity.append(t['popularity'])
            explicit.append(t['explicit'])

            # get audio features for track
            audio_features = sp.audio_features(t['id'])[0]
            danceability.append(audio_features['danceability'])
            duration_ms.append(audio_features['duration_ms'])
            energy.append(audio_features['energy'])
            key.append(audio_features['key'])
            loudness.append(audio_features['loudness'])
            mode.append(audio_features['mode'])
            speechiness.append(audio_features['speechiness'])
            acousticness.append(audio_features['acousticness'])
            instrumentalness.append(audio_features['instrumentalness'])
            liveness.append(audio_features['liveness'])
            valence.append(audio_features['valence'])
            tempo.append(audio_features['tempo'])
            time_signature.append(audio_features['time_signature'])
        

            # get featured artists
            if len(t['artists']) > 1:
                feat_artists = []
                for j in range(1, len(t['artists'])):
                    feat_artists.append(t['artists'][j]['name'])
                featured_artists.append(feat_artists)
            else:
                featured_artists.append([])
    except ReadTimeout as e:
        print(f"Error: {e}. Retrying in 5 seconds...")
        time.sleep(5)  # Retry after a short delay
    except SpotifyException as e:
        print(f"Spotify API Error: {e}")
        break

# create dataframe from lists
df_raw = pd.DataFrame({
    'artist_name': artist_name,
    'track_name': track_name,
    'track_id': track_id,
    'album_name': album_name,
    'album_id': album_id,
    'release_date': release_date,
    'duration_ms': duration_ms,
    'popularity': popularity,
    'explicit': explicit,
    'danceability': danceability,
    'energy': energy,
    'key': key,
    'loudness': loudness,
    'mode': mode,
    'speechiness': speechiness,
    'acousticness': acousticness,
    'instrumentalness': instrumentalness,
    'liveness': liveness,
    'valence': valence,
    'tempo': tempo,
    'time_signature': time_signature,
    'featured_artists': featured_artists
})



# end time and print
end_time = timer()
elapsed_time = int(end_time - start_time)
print(f"Elapsed time: {format_time(elapsed_time)}")

Elapsed time: 5 minutes, 55 seconds


In [10]:
# keeping records with only 'Burna Boy' as the artist name
df_burna = df_raw[df_raw['artist_name']=='Burna Boy']

In [11]:
# Burna's Spotify has one album with two spelling variations: 'On a Spaceship' and 'On A Spaceship' which are basically the same thing and this is proof 

spaceship = (df_burna['album_name'] == 'On a Spaceship') | (df_burna['album_name'] == 'On A Spaceship')
proof = df_burna.loc[spaceship].sort_values(by= 'track_name')

In [14]:
# Having examined the content of the two album values, we'll now keep one: 'On A Spaceship

# This variation we're keeping is missing one track and we will complete it with the other one 

df_burna.at[223, 'track_name'] = ' '.join(df_burna.at[225, 'track_name'].split()[:3])
df_burna.at[223, 'album_name'] = df_burna.at[225, 'album_name'].replace('On a Spaceship', 'On A Spaceship')
df_burna = df_burna.drop(df_burna[df_burna['album_name'] == 'On a Spaceship'].index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_burna.at[223, 'track_name'] = ' '.join(df_burna.at[225, 'track_name'].split()[:3])


In [15]:
# creating a new column to categorize songs to either Singles, Albums, EPs, Compilations

# Do this separately for the main_track df and the ft df, call it features for the ft df, before concat

albums_to_filter = ['I Told Them...', 
                    'Love, Damini',
                    'Twice As Tall',
                    'African Giant',
                    'Outside',
                    'On A Spaceship',
                    'L.I.F.E - Leaving an Impact for Eternity (Deluxe Edition)']

ep = ['Redemption', 'Steel & Copper']

compilation = ['Best of Burna Boy', 'Best of Burn Series, Vol. 1']

# Define conditions for each type
single_condition = df_burna['track_name'] == df_burna['album_name']
album_condition = df_burna['album_name'].isin(albums_to_filter)
ep_condition = df_burna['album_name'].isin(ep)
compilation_condition = df_burna['album_name'].isin(compilation)

# Update 'type' based on conditions
df_burna.loc[single_condition, 'type'] = 'Single'
df_burna.loc[album_condition, 'type'] = 'Album'
df_burna.loc[ep_condition, 'type'] = 'EP'
df_burna.loc[compilation_condition, 'type'] = 'Compilation'

In [16]:
# scraping a playlist that contains all Burna Boy's features 

# start time
start_time = timer()

# Get playlist URI
playlist_uri = "https://open.spotify.com/playlist/0td2WSZ8vJjToLUjq3VELy?si=1d42b850ed6d421b"

# Get playlist tracks
playlist = sp.playlist(playlist_uri)
tracks = playlist['tracks']['items']

# Initialize data structures
song_features = []

# Define offset for pagination
offset = 0
limit = 100

# Iterate through playlist tracks, making additional requests as needed
while True:
    # Get playlist tracks with pagination
    results = sp.user_playlist_tracks(user="Uncle Pat", playlist_id=playlist_uri, offset=offset, limit=limit)

    # Check if there are more tracks to retrieve
    if not results['items']:
        break

    # Extract track information and audio features
    for track in results['items']:
        track_uri = track['track']['uri']
        track_name = track['track']['name']

        # Extract track information
        track_info = sp.track(track_uri)
        artist_name = track_info['artists'][0]['name']
        track_name = track_info['name']
        track_id = track_info['id']
        album_name = track_info['album']['name']
        album_id = track_info['album']['id']
        release_date = track_info['album']['release_date']
        duration_ms = track_info['duration_ms']
        popularity = track_info['popularity']
        explicit = track_info['explicit']
        
        
        # Extract audio features
        track_features = sp.audio_features(track_uri)

        # Check if audio features are available
        if track_features:
            track_features = track_features[0]

            # Extract audio features
            danceability = track_features['danceability']
            energy = track_features['energy']
            key = track_features['key']
            loudness = track_features['loudness']
            mode = track_features['mode']
            speechiness = track_features['speechiness']
            acousticness = track_features['acousticness']
            instrumentalness = track_features['instrumentalness']
            liveness = track_features['liveness']
            valence = track_features['valence']            
            tempo = track_features['tempo']
            time_signature = track_features['time_signature']
            
            # get featured artists
            featured_artists = []
            if len(track_info['artists']) > 1:
                feat_artists = []
                for j in range(1, len(track_info['artists'])):
                    feat_artists.append(track_info['artists'][j]['name'])
                featured_artists.append(feat_artists)
            else:
                featured_artists.append([])
   

            # Store data
            song_features.append([
                artist_name, track_name, track_id, album_name,
                album_id, release_date, duration_ms, popularity,
                explicit, danceability, energy, key, loudness,
                mode, speechiness, acousticness, instrumentalness,
                liveness, valence, tempo, time_signature, featured_artists
            ])
        else:
            print("Audio features not available for track:", track_name)

    # Update offset for next request
    offset += limit

# Save data to CSV file
ft = pd.DataFrame(
    song_features, 
    columns=[ 'artist_name', 'track_name', 'track_id', 'album_name', 'album_id',
                'release_date', 'duration_ms', 'popularity', 'explicit',
                'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness',
                'valence', 'tempo', 'time_signature', 'featured_artists'])

# end time and print
end_time = timer()
elapsed_time = int(end_time - start_time)
print(f"Elapsed time: {format_time(elapsed_time)}")

Elapsed time: 1 minutes, 22 seconds


In [17]:
# creating a new column to categorize songs from this playlist to Features
ft['type']='Feature' 

In [61]:
# joining the df_burna and ft dataframes
df = pd.concat([df_burna, ft], ignore_index= True, sort= False)

In [62]:
# Eliminating rows that share the same track name 

# The main track has higher popularity than duplicate tracks that share the same name hence it was an elimination by popularity

df = df.sort_values(by= ['track_name', 'popularity'], ascending= [True, False])
df.drop_duplicates(subset= 'track_name', keep= 'first', inplace= True, ignore_index= True)

In [None]:
# check the records that wasn't assigned any song type ie. single, album, feature, ep, compilation
notype = df['type'].isna()
flush = df.loc[notype].sort_values(by= 'popularity', ascending= False) 

In [64]:
# a glance and you'll find movies album soundtracks featuring Burna Boy and these need to be accounted for 
# locate by index and edit song type
df.loc[(20, 136, 172), 'type'] = 'Feature'

In [65]:
# go ahead and drop the rest of the records that is not categorized in any song type
index_flush = flush.index
df = df.drop(index_flush)

In [66]:
# modifying the 'mode' column by replacing 0 with Minor and 1 with Major
df['mode'].replace({0:'Minor', 1:'Major'}, inplace= True)

In [67]:
# modifying the duration_ms column to appear as minutes and seconds 

df['duration_ms'] = pd.to_timedelta(df['duration_ms'], unit='ms')

# extract the minutes and seconds components of the duration as strings
df['duration'] = df['duration_ms'].dt.components['minutes'].astype(str).str.zfill(2) + ':' + df['duration_ms'].dt.components['seconds'].astype(str).str.zfill(2)
df.drop(columns=['duration_ms'], inplace= True)

In [68]:
# creating three new fields: year, month and day a song was released from release date column
df['release_date'] = pd.to_datetime(df['release_date'])
df['year'] = df['release_date'].dt.year
df['month'] = df['release_date'].dt.month_name()
df['day_of_the_week'] = df['release_date'].dt.day_name()

In [70]:
fd = pd.DataFrame(df)	

In [79]:
# Time to remove the square brackets from the featured artist column 

# They have square brackets in the first place because it's a group of smaller lists in a larger list that was embbedded in the DataFrame

# We convert the data type to string no matter what, then remove the square brackets and apostrophe around it 

df['featured_artists'] = df['featured_artists'].astype(str)
df['featured_artists'] = df['featured_artists'].replace({"'": '', '"': '', r'\[':'', r'\]':''}, regex=True)


In [94]:
key_dict = {
    0: "C",
    1: "C#/Db",
    2: "D",
    3: "D#/Eb",
    4: "E",
    5: "F",
    6: "F#/Gb",
    7: "G",
    8: "G#/Ab",
    9: "A",
    10: "A#/Bb",
    11: "B",
    -1: "NaN"
}

df['key'] = df['key'].map(key_dict).fillna("NaN")


In [95]:
df.to_csv('Burna Boy Discography.csv', index= False)