# Spotify API Scraping for Creating Datasets
### I used the code (but slightly modified it to my needs to include the `release_date`) from this [lovely article](https://www.linkedin.com/pulse/extracting-your-fav-playlist-info-spotifys-api-samantha-jones/)!

In [1]:
# pip install spotifyscraper

In [2]:
# pip install pathlib

In [3]:
# pip install ruamel-yaml

In [4]:
# pip install spotipy

In [5]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
import time

In [7]:
cid = '6ff98eca336346ee942d607cc2d23879'
secret = '7d7292ad3cf0420e8f270e7d049f40ba'
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [8]:
# Pagination (to extract more than 100 songs at a time)
def call_playlist(creator, playlist_id):
    # Step 1: Initialize DataFrame and other variables
    playlist_features_list = ["artist", "album", "track_name", "track_id", "release_date", "danceability", "energy", "key", "loudness", "mode", "speechiness", "instrumentalness", "liveness", "valence", "tempo", "duration_ms", "time_signature"]
    playlist_df = pd.DataFrame(columns=playlist_features_list)
    offset = 0
    total_tracks = sp.user_playlist_tracks(creator, playlist_id)["total"]
    
    # Step 2: Fetch tracks with pagination
    while offset < total_tracks:
        playlist = sp.user_playlist_tracks(creator, playlist_id, offset=offset)["items"]
        for track in playlist:
            playlist_features = {}
            playlist_features["artist"] = track["track"]["album"]["artists"][0]["name"]
            playlist_features["album"] = track["track"]["album"]["name"]
            playlist_features["track_name"] = track["track"]["name"]
            playlist_features["track_id"] = track["track"]["id"]
            playlist_features["release_date"] = track["track"]["album"]["release_date"]
            audio_features = sp.audio_features(playlist_features["track_id"])[0]
            for feature in playlist_features_list[5:]:
                playlist_features[feature] = audio_features[feature]
            track_df = pd.DataFrame(playlist_features, index=[0])
            playlist_df = pd.concat([playlist_df, track_df], ignore_index=True)
        offset += 100
    # Step 3: Return DataFrame
    return playlist_df

In [9]:
# Function to fetch audio features with retry logic
def fetch_audio_features(track_id):
    retries = 10  # Maximum number of retry attempts
    for _ in range(retries):
        try:
            return sp.audio_features(track_id)[0]
        except spotipy.SpotifyException as e:
            if e.http_status == 429:
                # Retry after a fixed delay
                retry_after = int(e.headers.get('Retry-After', 10))  # Default to 10 seconds if no Retry-After header
                print(f"Rate limited. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
            else:
                raise  # Re-raise the exception if it's not a 429 error
    raise Exception("Max retries reached, unable to fetch audio features")

In [10]:
# https://open.spotify.com/playlist/5yAPuepGnApi5yc4QoZMDl
# "old" Playlist compiled by "emmabittinger" (this is a test)
old_playlist = call_playlist("spotify","5yAPuepGnApi5yc4QoZMDl")

In [11]:
old_playlist.head()

Unnamed: 0,artist,album,track_name,track_id,release_date,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Various Artists,Cars (Versión de Colección),"Life Is A Highway - From ""Cars""/Soundtrack Ver...",1QezVl06xBzPfgJ2HXST5d,2006-06-06,0.561,0.932,5,-5.475,1,0.0584,0.0,0.181,0.67,103.062,275640,4
1,Anthem Lights,Covers Part IV,Don't Stop Believing,0wBqAqxUygzHrUgw0MTJ6J,2015-07-17,0.516,0.391,10,-7.319,1,0.0315,0.0,0.144,0.395,117.873,218644,4
2,Bruce Springsteen,Born In The U.S.A.,Born in the U.S.A.,0dOg1ySSI7NkpAe89Zo0b9,1984-06-04,0.398,0.952,4,-6.042,1,0.061,7.7e-05,0.1,0.584,122.093,278680,4
3,Lynyrd Skynyrd,Second Helping (Expanded Edition),Sweet Home Alabama,7e89621JPkKaeDSTQ3avtg,1974-04-15,0.596,0.605,7,-12.145,1,0.0255,0.000331,0.0863,0.886,97.798,283800,4
4,Rick Astley,Whenever You Need Somebody,Never Gonna Give You Up,7GhIk7Il098yCjg4BQjzvb,1987-12-08,0.727,0.939,8,-11.855,1,0.0369,4.4e-05,0.151,0.916,113.33,212827,4


In [12]:
# https://open.spotify.com/playlist/66kbLWdmxWuMYeByFkqADT
# "throwbaccc" Playlist compiled by "emmabittinger"
throbaccc_playlist = call_playlist("spotify","66kbLWdmxWuMYeByFkqADT")

In [14]:
throbaccc_playlist.to_csv("throbaccc_playlist.csv")

In [15]:
# https://open.spotify.com/playlist/7dBWDKw7I8kZy0td1VYFIY
# "Songs Everyone Knows the Words To" Playlist compiled by "Ava Montgomery"
long_playlist = call_playlist("spotify","7dBWDKw7I8kZy0td1VYFIY")

In [16]:
long_playlist.to_csv("long_playlist.csv")

In [17]:
throbaccc_playlist.head()

Unnamed: 0,artist,album,track_name,track_id,release_date,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Miley Cyrus,The Time Of Our Lives,Party In The U.S.A.,5Q0Nhxo0l2bP3pNjpGJwV1,2009-01-01,0.652,0.698,10,-4.667,0,0.042,0.000115,0.0886,0.47,96.021,202067,4
1,Rihanna,Loud,What's My Name?,5FTCKvxzqy72ceS4Ujux4N,2010-11-16,0.692,0.786,2,-2.959,1,0.069,0.0,0.0797,0.583,100.025,263173,4
2,Train,"Hey, Soul Sister","Hey, Soul Sister",0KpfYajJVVGgQ32Dby7e9i,2009-08-06,0.675,0.885,1,-4.432,0,0.0436,0.0,0.086,0.768,97.03,216667,4
3,Justin Bieber,My World,One Time,6eDApnV9Jdb1nYahOlbbUh,2009-01-01,0.691,0.853,1,-2.528,0,0.0372,7.1e-05,0.082,0.762,145.999,215867,4
4,Taio Cruz,Rokstarr (International Version),Dynamite,0bg6otrW5gxNnlCqrCrXyd,2010-05-28,0.754,0.804,4,-3.177,1,0.0853,0.0,0.0329,0.818,119.968,203867,4


In [18]:
throbaccc_playlist.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,instrumentalness,liveness,valence,tempo
count,330.0,330.0,330.0,330.0,330.0,330.0,330.0,330.0
mean,0.647394,0.738092,-5.048933,0.075087,0.005875,0.177032,0.58791,121.181464
std,0.117276,0.164847,1.792816,0.061893,0.056663,0.128664,0.206436,25.593651
min,0.327,0.0565,-15.099,0.0254,0.0,0.0193,0.0765,65.043
25%,0.5785,0.67625,-5.88925,0.0377,0.0,0.088675,0.428,101.7345
50%,0.6585,0.773,-4.8235,0.0511,0.0,0.127,0.619,122.624
75%,0.72475,0.857,-3.8485,0.08805,4e-06,0.24875,0.748,132.023
max,0.979,0.981,-1.644,0.449,0.871,0.758,0.965,194.077


In [19]:
long_playlist.head()

Unnamed: 0,artist,album,track_name,track_id,release_date,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Coldplay,Viva La Vida or Death and All His Friends,Viva La Vida,1mea3bSkSGXuIRvnydlB5b,2008-05-26,0.486,0.617,5,-7.115,0,0.0287,3e-06,0.109,0.417,138.015,242373,4
1,Rihanna,Good Girl Gone Bad,Umbrella,2yPoXCs7BSIUrucMdK5PzV,2007-01-01,0.583,0.829,1,-4.603,1,0.134,0.0,0.0426,0.575,174.028,275987,4
2,*NSYNC,No Strings Attached,Bye Bye Bye,62bOmKYxYg7dhrC6gH9vFn,2000-03-21,0.61,0.926,8,-4.843,0,0.0479,0.0012,0.0821,0.861,172.638,200400,4
3,Train,"Save Me, San Francisco (Golden Gate Edition)","Hey, Soul Sister",4HlFJV71xXKIGcU3kRyttv,2010-12-01,0.673,0.886,1,-4.44,0,0.0431,0.0,0.0826,0.795,97.012,216773,4
4,Carrie Underwood,Some Hearts,Before He Cheats,0ZUo4YjG4saFnEJhdWp9Bt,2005-11-14,0.519,0.749,6,-3.318,0,0.0405,0.0,0.119,0.29,147.905,199947,4


In [20]:
long_playlist.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,instrumentalness,liveness,valence,tempo
count,345.0,345.0,345.0,345.0,345.0,345.0,345.0,345.0
mean,0.614093,0.704965,-5.68251,0.079451,0.007984,0.17513,0.508641,125.870136
std,0.139569,0.17148,2.219701,0.069564,0.049434,0.1355,0.228687,26.866936
min,0.209,0.111,-18.064,0.0249,0.0,0.021,0.0385,65.997
25%,0.522,0.594,-6.682,0.0382,0.0,0.0901,0.336,106.97
50%,0.614,0.728,-5.223,0.0545,0.0,0.12,0.506,125.072
75%,0.717,0.839,-4.165,0.0853,9e-05,0.237,0.691,142.673
max,0.967,0.986,-1.848,0.449,0.616,0.77,0.969,199.935
