In [2]:
import spotipy
import pandas as pd
import requests
import numpy as np
import time
from timeit import default_timer as timer
from datetime import timedelta
import configparser
from spotipy.oauth2 import SpotifyClientCredentials
from pandas.api.types import CategoricalDtype
import configparser
import os

In [3]:
current_dir = os.path.dirname(os.path.abspath("Ed Sheeran"))

config_path = os.path.join(current_dir, '..', 'config.ini')

config = configparser.ConfigParser()

config.read(config_path)

client_id = config.get('credentials', 'Client_ID')
client_secret = config.get('credentials', 'Client_Secret')

client_credentials_manager = SpotifyClientCredentials(
                                client_id = client_id, client_secret = client_secret)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) 

In [4]:
def timekeeper(seconds):
    minutes = seconds // 60
    seconds = seconds % 60
    if minutes > 0:
        return f"{minutes} minutes, {seconds} seconds"
    else:
        return f"{seconds} seconds"

In [6]:
def collect_spotify_data():
    start_time = timer()
    data = []

    for i in range(0, 1000, 50):
        try:
            track_results = sp.search(q='artist:Ed Sheeran', type='track', limit=50, offset=i)
            for item in track_results['tracks']['items']:
                track_info = {
                    'artist_name': item['artists'][0]['name'],
                    'track_name': item['name'],
                    'track_id': item['id'],
                    'album_name': item['album']['name'],
                    'album_id': item['album']['id'],
                    'release_date': item['album']['release_date'],
                    'popularity': item['popularity'],
                    'explicit': item['explicit']
                }
                
                audio_features = sp.audio_features(item['id'])[0]
                if audio_features is not None:
                    audio_info = {
                        'danceability': audio_features.get('danceability', float('nan')),
                        'duration_ms': audio_features.get('duration_ms', float('nan')),
                        'energy': audio_features.get('energy', float('nan')),
                        'key': audio_features.get('key', float('nan')),
                        'loudness': audio_features.get('loudness', float('nan')),
                        'mode': audio_features.get('mode', float('nan')),
                        'speechiness': audio_features.get('speechiness', float('nan')),
                        'acousticness': audio_features.get('acousticness', float('nan')),
                        'instrumentalness': audio_features.get('instrumentalness', float('nan')),
                        'liveness': audio_features.get('liveness', float('nan')),
                        'valence': audio_features.get('valence', float('nan')),
                        'tempo': audio_features.get('tempo', float('nan')),
                        'time_signature': audio_features.get('time_signature', float('nan'))
                    }
                    track_info.update(audio_info)
                else:
                    audio_info = {
                        'danceability': float('nan'),
                        'duration_ms': float('nan'),
                        'energy': float('nan'),
                        'key': float('nan'),
                        'loudness': float('nan'),
                        'mode': float('nan'),
                        'speechiness': float('nan'),
                        'acousticness': float('nan'),
                        'instrumentalness': float('nan'),
                        'liveness': float('nan'),
                        'valence': float('nan'),
                        'tempo': float('nan'),
                        'time_signature': float('nan')
                    }
                    track_info.update(audio_info)

                
                if len(item['artists']) > 1:
                    featured_artists = [artist['name'] for artist in item['artists'][1:]]
                    track_info['featured_artists'] = featured_artists
                else:
                    track_info['featured_artists'] = []

                data.append(track_info)

        except requests.exceptions.ReadTimeout as e:
            print(f"Timeout error: {e}. Retrying in 5 seconds...")
            time.sleep(5) 
        except requests.exceptions.RequestException as e:
            print(f"API error: {e}")
            break 

    df_raw = pd.DataFrame(data)
    end_time = timer()
    elapsed_time = int(end_time - start_time)
    print(f"Elapsed time:", timekeeper(elapsed_time))
    return df_raw

df_spotify = collect_spotify_data()

Elapsed time: 4 minutes, 34 seconds


In [7]:
ed = df_spotify[df_spotify['artist_name'] == 'Ed Sheeran']
ed.to_csv('raw material.csv')

In [18]:
ed = pd.read_csv("raw material.csv")
ed.drop(["Unnamed: 0"], axis=1, inplace=True)
ed.head()

Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,popularity,explicit,danceability,duration_ms,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,featured_artists
0,Ed Sheeran,Shivers,50nfwKoDiSYg8zOCREWAm5,=,32iAEBstCjauDhyKpGjTuq,2021-10-25,83,False,0.788,207853,...,-2.724,1,0.0856,0.281,0.0,0.0424,0.822,141.02,4,[]
1,Ed Sheeran,Perfect,0tgVpDi06FyKpA1z0VMD4v,÷ (Deluxe),3T4tUhGYeRNVUGevb0wThu,2017-03-03,85,False,0.599,263400,...,-6.312,1,0.0232,0.163,0.0,0.106,0.168,95.05,3,[]
2,Ed Sheeran,Shape of You,7qiZfU4dY1lWllzX7mPBI3,÷ (Deluxe),3T4tUhGYeRNVUGevb0wThu,2017-03-03,86,False,0.825,233713,...,-3.183,0,0.0802,0.581,0.0,0.0931,0.931,95.977,4,[]
3,Ed Sheeran,Bad Habits,3rmo8F54jFF8OgYsqTxm5d,=,32iAEBstCjauDhyKpGjTuq,2021-10-25,80,False,0.807,230747,...,-3.745,0,0.0347,0.0451,2.8e-05,0.366,0.537,126.011,4,[]
4,Ed Sheeran,Castle on the Hill,6PCUP3dWmTjcTtXY02oFdT,÷ (Deluxe),3T4tUhGYeRNVUGevb0wThu,2017-03-03,78,False,0.461,261154,...,-4.868,1,0.0989,0.0232,1.1e-05,0.14,0.471,135.007,4,[]


In [19]:
studio_albums = [
    '=', 
    '÷ (Deluxe)', 
    'No.6 Collaborations Project', 
    'x (Deluxe Edition)', 
    '+',
    '- (Deluxe)',
    'Autumn Variations'
]

live_albums = [
    
    'x (Wembley Edition)',
]

remix_album = ['2step (The Remixes)']

compilation_album = [
    '+-=÷× (Tour Collection)', 
    '= (Tour Edition)',
    '5'
]

extended_play = [
    'Loose Change',
    'No.5 Collaborations Project', 
    'Live at the Bedford',
    'Spotify Session',
    'Live and in Session',
    'The Slumdon Bridge',
    'You Need Me',
    'Spotify Singles'
]

singles = ed['track_name'] == ed['album_name']
lp = ed['album_name'].isin(studio_albums)
ep = ed['album_name'].isin(extended_play)
remixes = ed['album_name'].isin(remix_album)
live = ed['album_name'].isin(live_albums)
comp = ed['album_name'].isin(compilation_album)

conditions = [singles, lp, ep, remixes, live, comp]
displays = ['Single', 'Studio Album', 'Extended Play', 'Remix Album', 'Live Album', 'Compilation Album']
ed['type'] = np.select(conditions, displays, default='')

ed[['artist_name', 'track_name', 'album_name', 'type']].head()

Unnamed: 0,artist_name,track_name,album_name,type
0,Ed Sheeran,Shivers,=,Studio Album
1,Ed Sheeran,Perfect,÷ (Deluxe),Studio Album
2,Ed Sheeran,Shape of You,÷ (Deluxe),Studio Album
3,Ed Sheeran,Bad Habits,=,Studio Album
4,Ed Sheeran,Castle on the Hill,÷ (Deluxe),Studio Album


In [21]:
notype = ed[ed['type'] =='']
flush = notype.index
ed = ed.drop(flush)
notype[['artist_name', 'track_name', 'album_name', 'type']]

Unnamed: 0,artist_name,track_name,album_name,type


In [22]:
ed['mode'] = ed['mode'].replace({0:'Minor', 1:'Major'})
ed[['track_name', 'mode']].head()

Unnamed: 0,track_name,mode
0,Shivers,Major
1,Perfect,Major
2,Shape of You,Minor
3,Bad Habits,Minor
4,Castle on the Hill,Major


In [23]:
ed['duration_ms'] = pd.to_timedelta(ed['duration_ms'], unit='ms')

ed['duration'] = ed['duration_ms'].dt.components['minutes'].astype(str).str.zfill(2) + ':' + ed['duration_ms'].dt.components['seconds'].astype(str).str.zfill(2)
ed.drop(columns=['duration_ms'], inplace= True)
ed[['track_name','duration']].head()

Unnamed: 0,track_name,duration
0,Shivers,03:27
1,Perfect,04:23
2,Shape of You,03:53
3,Bad Habits,03:50
4,Castle on the Hill,04:21


In [25]:
ed['release_date'] = np.where(
    ed['album_name'] == 'x (Wembley Edition)', '2015-11-13',
    np.where(
        ed['album_name'] == 'Spotify Session', '2011-01-01',
        np.where(
            ed['album_name'] == 'You Need Me', '2009-01-01',
            ed['release_date']  # Keep the original date if no match
        )
    )
)

In [26]:
ed['release_date'] = pd.to_datetime(ed['release_date'], errors='coerce')
ed['year'] = ed['release_date'].dt.year
ed['month'] = ed['release_date'].dt.month_name()
ed['day_of_the_week'] = ed['release_date'].dt.day_name()

ed[['track_name', 'year', 'month', 'day_of_the_week']].head()

Unnamed: 0,track_name,year,month,day_of_the_week
0,Shivers,2021,October,Monday
1,Perfect,2017,March,Friday
2,Shape of You,2017,March,Friday
3,Bad Habits,2021,October,Monday
4,Castle on the Hill,2017,March,Friday


In [28]:
key_dict = {
    0: "C",
    1: "C#/Db",
    2: "D",
    3: "D#/Eb",
    4: "E",
    5: "F",
    6: "F#/Gb",
    7: "G",
    8: "G#/Ab",
    9: "A",
    10: "A#/Bb",
    11: "B",
    -1: "NaN"
}

ed['key'] = ed['key'].map(key_dict).fillna("NaN")
ed[['track_name', 'key']].head()

Unnamed: 0,track_name,key
0,Shivers,D
1,Perfect,G#/Ab
2,Shape of You,C#/Db
3,Bad Habits,B
4,Castle on the Hill,D


In [29]:
ed.to_csv('Ed Sheeran Dataset.csv', index= False)