	# Author: Alexander Staub
	## Last changed: 2023.03.21
	## Purpose: Accessing the Spotify API


# Done to dos:
* rate limits: 10/second without rate limit extension
* changed the .append to .concat in the loop to avoid warnings 
* added the audio feature elements from spotify - all of them according to gpt-4


# Next up to dos - 23.03.28

* with the first random sample, a number of songs were not matched. Think of a way to get all the songs that weren't matched into a seperate dataframe


In [1]:
#library loading

import os
import spotipy # lightweight Python library for the Spotify Web API
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials # to access authorised Spotify data
import json # to read json files
import time # to time the code
import requests # to make http requests

In [2]:
# path specification - using os

# Get the current working directory
current_dir = os.getcwd()

# Navigate up two levels to the root directory
root_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))

print(root_dir)


c:\Users\User\Documents\R_Work\research\music_data


In [3]:
# get the client id and client secret for my account

# access the spotify credentials from the json file and store it as variable "f"
with open('spotify_credentials.json') as f:

    # transform the json file into a dictionary
    data = json.load(f)
    client_id = data['SPOTIPY_CLIENT_ID']
    client_secret = data['SPOTIPY_CLIENT_SECRET']

if not client_id or not client_secret:
    raise ValueError("Spotify API credentials not found in environment variables")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

In [5]:
# load in the random sample w isrcs into a dataframe

# Create the path to the CSV file
csv_file_path = os.path.join(root_dir, 'data', 'interim_data', 'random_sample_w_iscrcs_mb_v5_export.csv')

#create the json file path
json_file_path = os.path.join(root_dir, 'data', 'raw_data','musicbrainz', 'isrcs_from_release_with_label_v6.json')

Below is the code to read in a csv - created when interested in analyzing a random sample

In [6]:
#detect the encoding of the csv

import chardet    

rawdata = open(csv_file_path, "rb").read()
result = chardet.detect(rawdata)
charenc = result['encoding']

In [7]:
# Now read the CSV into a DataFrame
sample_songs_w_isrcs = pd.read_csv(csv_file_path, encoding=charenc)

#remove the column named "unnamed" from the sample_songs_w_isrcs dataframe
sample_songs_w_isrcs = sample_songs_w_isrcs.loc[:, ~sample_songs_w_isrcs.columns.str.contains('^Unnamed')]


Currently, the loaded dataframe includes variables of the type "object" - which means that they are some generic form of data type.
Converting all but date to string will make it easier to access spotify API

In [8]:
#loop through all variables and convert them to strings - except the date column
for col in sample_songs_w_isrcs.columns:
    if col != 'date':
        sample_songs_w_isrcs[col] = sample_songs_w_isrcs[col].astype(str)



Below is the code for reading in the json based dataframe including isrcs from all US released songs in the timeframe of interest

In [9]:
# read in json file
us_songs_w_isrcs = pd.read_json(json_file_path)



In [10]:
# drop the last column
us_songs_w_isrcs = us_songs_w_isrcs.iloc[:, :-1]

In [11]:
#loop through all variables and convert them to strings - except the date column
for col in us_songs_w_isrcs.columns:
    if col != 'release_date':
        us_songs_w_isrcs[col] = us_songs_w_isrcs[col].astype(str)

The code below is meant to:
- check the ISRC codes in the musicbrainz dataframe and collect the name of thesong, artist and acoustic charactersitics from the Spotify Api
- respect the rate limits of 10 requests per second
- save the data periodically in case of some error
- collect data about unmatched isrcs in a seperate file for later review
- not restart from 0 in case the operation fails, but check what was already collected

In [12]:
# create the save path
save_path = os.path.join(root_dir, "data", "interim_data", "spotify_isrc_sample")

print(save_path)

c:\Users\User\Documents\R_Work\research\music_data\data\interim_data\spotify_isrc_sample


In [14]:
#try whether the the dataframe already exists or whether it needs to be created

# Try whether the dataframe already exists or whether it needs to be created
try:
    df_spotify = pd.read_csv(os.path.join(save_path, "partial_spotify_data.csv"))
except (FileNotFoundError, pd.errors.EmptyDataError):  # Catching both exceptions
    df_spotify = pd.DataFrame()

try:
    with open(os.path.join(save_path, "unmatched_isrcs.json"), "r") as f:
        unmatched_isrcs = json.load(f)
except FileNotFoundError:
    unmatched_isrcs = []

# List to hold dictionaries to append to df_spotify
data_to_append = []


In [15]:
#check 
us_songs_w_isrcs.describe()

# check how many NA values are present by column in the us_songs_w_isrcs dataframe
us_songs_w_isrcs.isna().sum()

isrc                  0
track_id              0
track_title           0
artist_name           0
artist_mbid           0
release_mbid          0
release_group_mbid    0
release_date          0
dtype: int64

In [16]:
# Track where you are in the loop in case you have to restart
start_point = 0
try:
    # Check if a start_point file exists (this assumes the file holds an integer)
    with open(os.path.join(save_path, "start_point.txt"), "r") as f:
        start_point = int(f.read())
except FileNotFoundError:
    pass

Tenacity based rate limiting with exponential wait times - might be causing an issue

In [33]:
from tenacity import retry, stop_after_attempt, wait_exponential
import time

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=5, max=60))
def fetch_track_data(isrc):
    time.sleep(1/3)  # Slow down to 3 requests per second
    return sp.search(q=f'isrc:{isrc}', type='track')

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=5, max=60))
def fetch_audio_features(spotify_id):
    time.sleep(1/3)  # Slow down to 3 requests per second
    return sp.audio_features(spotify_id)[0]

Alternative, less sophisticated rate limiting to check if tenacity is causing the problem or if I have been blocked from this IP address

In [17]:
import time

def fetch_track_data(isrc):
    for _ in range(5):
        time.sleep(1/3)  # Wait 1/3 of a second before making the request
        try:
            return sp.search(q=f'isrc:{isrc}', type='track')
        except requests.exceptions.HTTPError as err:
            print(f"Error, will retry: {err}")
            time.sleep(5)  # 5 seconds delay before retry
    print("Max retries reached")
    return None

def fetch_audio_features(spotify_id):
    for _ in range(5):
        time.sleep(1/3)  # Wait 1/3 of a second before making the request
        try:
            return sp.audio_features(spotify_id)[0]
        except requests.exceptions.HTTPError as err:
            print(f"Error, will retry: {err}")
            time.sleep(5)  # 5 seconds delay before retry
    print("Max retries reached")
    return None


In [18]:
# Loop through your Musicbrainz dataframe
for i, row in us_songs_w_isrcs.iloc[start_point:].iterrows():
    
    isrc = row['isrc']
    
    try:
        # Search for track by ISRC
        results = fetch_track_data(isrc)
        
        # Check if the search returned any tracks
        if results['tracks']['items']:
            track = results['tracks']['items'][0]
            
            # Fetch audio features using Spotify ID
            #audio_features = sp.audio_features(track["id"])[0]
            audio_features = fetch_audio_features(track["id"])

            # Extract and append information to df_spotify
            if audio_features is not None:
                track_info = {
                    "ISRC": isrc,
                    "name": track["name"],
                    "artist": track["artists"][0]["name"],
                    **audio_features
                }
                data_to_append.append(track_info)
            else:
                print(f"Audio features missing for ISRC: {isrc}")
                unmatched_isrcs.append((i, isrc))
        else:
            print(f"Couldn't match ISRC: {isrc}")
            unmatched_isrcs.append((i, isrc))

    except requests.exceptions.HTTPError as err:
        if err.response.status_code == 429:  # HTTP Status Code for Too Many Requests
            retry_after = err.response.headers.get('Retry-After', None)
            if retry_after:
                print(f"Rate limit reached. Retry after {retry_after} seconds.")
            else:
                print("Rate limit reached. No Retry-After header found.")
            continue  # Retry logic is now handled by the @retry decorator
        else:
            print(f"HTTP Error for ISRC: {isrc}, Error: {err}")
            unmatched_isrcs.append((i, isrc))
        
    # Manage rate limits (Spotify API allows 10 requests per second)
    #time.sleep(0.2)
    
    # Save start_point
    with open(os.path.join(save_path, "start_point.txt"), "w") as f:
        f.write(str(i))
    
    # You may also want to save your results periodically to a CSV or JSON
    if i % 100 == 0:
        df_spotify.to_csv(os.path.join(save_path, "partial_spotify_data.csv"), index=False)
        with open(os.path.join(save_path, "unmatched_isrcs.json"), "w") as f:
            json.dump(unmatched_isrcs, f)


#Use pandas.concat to append all rows at once
if data_to_append:
    df_spotify = pd.concat([df_spotify, pd.DataFrame(data_to_append)], ignore_index=True)


# Save your results
df_spotify.to_csv(os.path.join(save_path, "complete_spotify_data.csv"), index=False)
with open(os.path.join(save_path, "unmatched_isrcs.json"), "w") as f:
    json.dump(unmatched_isrcs, f)

Max Retries reached


SpotifyException: http status: 429, code:-1 - /v1/audio-features/?ids=69Vhf1vZKV0yord49HiRi4:
 Max Retries, reason: too many 429 error responses

# First random sample of songs - without ISRC
To do:
- move this into a new Jupyter notebook

In [1]:
import os
import spotipy # lightweight Python library for the Spotify Web API
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials # to access authorised Spotify data

In [2]:
# package for accessing the local environment variables
import os

# get the client id and client secret from the environment variables
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')

if not client_id or not client_secret:
    raise ValueError("Spotify API credentials not found in environment variables")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

In [3]:
#required package

from pandas import read_csv

# loading in the csv file with the songs from billboard and musicbrainz

df_HH_mb_matched = read_csv('C:/R work/Research/music_data/data/interim_data/df_hh_and_mb_leftjoin_fuzzy.csv')

In [5]:
# draw a random observation from the dataframe, retaining only the first two columns 
random_100_df = df_HH_mb_matched.sample(100)[['Artist', 'Track']]



In [6]:
#create the song infor retreival function

# added a bit to the function to handle errors and retries
import time

#here the max retries is 3
def get_song_data(song_name, artist_name, max_retries=3):
    query = f'track:{song_name} artist:{artist_name}'
    retries = 0
    
    #while loop to retry the query if it fails n times (as defined by max_retries)
    while retries <= max_retries:
        try:
            results = sp.search(q=query, type='track', limit=1)
            
            if results['tracks']['items']:
                track = results['tracks']['items'][0]

               # Get the track's ID
                track_id = track['id']
                
                # Get the audio features for the track
                audio_features = sp.audio_features([track_id])[0]

                song_data = {
                    'song_name': track['name'],
                    'artist_name': track['artists'][0]['name'],
                    'album_name_spotify': track['album']['name'],
                    'release_date_spotify': track['album']['release_date'],
                    'popularity': track['popularity'],
                    'explicit': track['explicit'],
                    'duration_ms': track['duration_ms'],
                    # 'preview_url': track['preview_url'],
                    'acousticness': audio_features['acousticness'],
                    'danceability': audio_features['danceability'],
                    'energy': audio_features['energy'],
                    'instrumentalness': audio_features['instrumentalness'],
                    'liveness': audio_features['liveness'],
                    'loudness': audio_features['loudness'],
                    'speechiness': audio_features['speechiness'],
                    'valence': audio_features['valence'],
                    'tempo': audio_features['tempo'],
                    'key': audio_features['key'],
                    'mode': audio_features['mode'], # mode is either major or minor
                    'time_signature': audio_features['time_signature']
                }
                return song_data
            else:
                return None
        
        except Exception as e:
            print(f"Error: {e}")
            retries += 1
            sleep_time = 2 ** retries
            print(f"Retrying ({retries}/{max_retries}) after {sleep_time} seconds")
            time.sleep(sleep_time)
    
    return None



In [11]:
# create an empty dataframe
results_df = pd.DataFrame(columns=["Track", "Artist", "song_name", "artist_name", "album_name_spotify", "release_date_spotify",
                                    "popularity", "explicit", "duration_ms", "acousticness", "danceability","energy","instrumentalness",
                                    "liveness", "loudness", "speechiness", "valence", "tempo", "key", "mode", "time_signature"])

# loop over the random sample dataframe

rest_period = 0.2  # Time in seconds between requests

for index, row in random_100_df.iterrows():
    song_name = row["Track"]
    artist_name = row["Artist"]

    song_data = get_song_data(song_name, artist_name)
    if song_data:
        song_data["Track"] = song_name
        song_data["Artist"] = artist_name

        # Convert song_data dictionary to a DataFrame
        song_data_df = pd.DataFrame([song_data])

        # changed results.append to results_df.concat as per the errors in the previous code
        
        results_df = pd.concat([results_df, song_data_df], ignore_index=True)
    else:
        print(f"Song not found: {song_name} by {artist_name}")
        
    time.sleep(rest_period)  # Pause execution for the specified rest period    



  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Karma by Lloyd Banks Featuring Avant


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Area Codes by Ludacris Featuring Nate Dogg


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: If I Could Go! by Angie Martinez Featuring Lil' Mo & Sacario
Song not found: Hey Luv (Anything) by Mobb Deep Featuring 112


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Sympathy For The Devil (Remixes) by The Rolling Stones


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Don't Think I'm Not by Kandi


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Don't Think I'm Not by Kandi


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Ride Wit Me by Nelly Featuring City Spud


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_inde

Song not found: Untitled (How Does It Feel) by D'Angelo


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: What Happened To That Boy by Baby Featuring Clipse


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Move Ya Body by Nina Sky Featuring Jabba


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Gangsta Lovin' by Eve Featuring Alicia Keys


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Drift Away by Uncle Kracker Featuring Dobie Gray


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Feel It Boy by Beenie Man Featuring Janet


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Oochie Wally by QB Finest Featuring Nas & Bravehearts


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)
  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: I'm Your Angel by R. Kelly & Celine Dion


  results_df = pd.concat([results_df, song_data_df], ignore_index=True)


Song not found: Work It by Missy "Misdemeanor" Elliott
