In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os
import pandas as pd
import time
from datetime import datetime


In [2]:
load_dotenv() # load environment variables

client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

# spotipy setup
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=20)



In [3]:
try:
    sp = spotipy.Spotify(
        client_credentials_manager=client_credentials_manager,
        requests_timeout=10
    )
    # Test the connection
    sp.search(q='test', limit=1)
    print("Spotify connection successful!")
except Exception as e:
    print(f"Authentication error: {e}")

Spotify connection successful!


In [4]:
#r = sp.search(q='Dont stop me now', type='track', limit=1)
#  r

{'tracks': {'href': 'https://api.spotify.com/v1/search?offset=0&limit=1&query=Dont%20stop%20me%20now&type=track',
  'limit': 1,
  'next': 'https://api.spotify.com/v1/search?offset=1&limit=1&query=Dont%20stop%20me%20now&type=track',
  'offset': 0,
  'previous': None,
  'total': 1000,
  'items': [{'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1dfeR4HaWDbWqFHLkxsg1d'},
       'href': 'https://api.spotify.com/v1/artists/1dfeR4HaWDbWqFHLkxsg1d',
       'id': '1dfeR4HaWDbWqFHLkxsg1d',
       'name': 'Queen',
       'type': 'artist',
       'uri': 'spotify:artist:1dfeR4HaWDbWqFHLkxsg1d'}],
     'available_markets': ['AR',
      'AU',
      'AT',
      'BE',
      'BO',
      'BR',
      'BG',
      'CL',
      'CO',
      'CR',
      'CY',
      'CZ',
      'DK',
      'DO',
      'DE',
      'EC',
      'EE',
      'SV',
      'FI',
      'FR',
      'GR',
      'GT',
      'HN',
      'HK',
      'HU',
      'IS',
      'IE'

In [5]:
genres = sp.artist('1dfeR4HaWDbWqFHLkxsg1d')
genres['genres']

['classic rock', 'glam rock', 'rock']

In [6]:
# Load million songs dataset
million_songs_df = pd.read_csv('../data/3_combined_clean_dataset.csv')

# Process settings
BATCH_SIZE = 25
SLEEP_TIME = 45

# Get the last processed index from existing backup file
backup_file = '../data/4_spotify_million_tracks.csv'
if os.path.exists(backup_file):
    processed_df = pd.read_csv (backup_file) #,
        #quoting=csv.QUOTE_MINIMAL,  # Handle quotes more carefully
        #escapechar='\\',  # Use backslash as escape character
        #encoding='utf-8'  # Explicitly specify encoding)
    

    start_index = len(processed_df)
    print(f'Resuming from index {start_index}')
else:
    start_index = 0
    print('Starting new processing')

# Process in batches
for start_idx in range(start_index, len(million_songs_df), BATCH_SIZE):
    batch = million_songs_df.iloc[start_idx:start_idx + BATCH_SIZE]
    tracks_data = []
    
    print(f'\nProcessing batch {start_idx//BATCH_SIZE + 1} of {len(million_songs_df)//BATCH_SIZE + 1}')
    print(f'Processing rows {start_idx} to {min(start_idx + BATCH_SIZE, len(million_songs_df))}')
    
    for i, row in batch.iterrows():
        try:
            # Use 'title' instead of 'song_title'
            results = sp.search(q=f'track:{row["title"]} artist:{row["artist"]}', type='track', limit=1)
            
            if results['tracks']['items']:
                track = results['tracks']['items'][0]
                artist_id = track['artists'][0]['id']
                genres = sp.artist(artist_id)['genres']
                print("Genres: ", genres)

                # Update track_data dictionary to use correct column names
                track_data = {
                    'original_title': row['title'],  # Changed from 'song_title'
                    'original_artist': row['artist'],
                    'spotify_title': track['name'],
                    'spotify_artist': track['artists'][0]['name'],
                    'album': track['album']['name'],
                    'release_date': track['album']['release_date'],
                    'popularity': track['popularity'],
                    'duration_ms': track['duration_ms'],
                    'explicit': track['explicit'],
                    'album_cover': track['album']['images'][0]['url'] if track['album']['images'] else None,
                    'genres': genres if genres else None
                }
                tracks_data.append(track_data)
                
        except Exception as e:
            print(f'Error processing track {row["title"]}: {str(e)}')  # Changed from 'song_title'
            continue

    # Save batch results
    if tracks_data:
        temp_df = pd.DataFrame(tracks_data)
        temp_df.to_csv(backup_file, 
                    mode='a', 
                    header=not os.path.exists(backup_file), 
                    index=False)
        
        timestamp = datetime.now().strftime('%H:%M:%S')
        print(f'Batch completed at {timestamp}')
        print(f'Processed {len(tracks_data)} tracks in this batch')
        print(f'Total tracks processed: {start_idx + len(tracks_data)}')
        print(f'Last track processed: {row["title"]}')
        print(f'Last track genres: {genres if genres else "No genres found"}')
        
        if start_idx + BATCH_SIZE < len(million_songs_df):  # don't sleep after the last batch
            print(f'Sleeping for {SLEEP_TIME} seconds...')
            print('-' * 50)
            time.sleep(SLEEP_TIME)

print(f'\nAll batches processed!')
print(f'Final total tracks processed: {start_idx + len(tracks_data)}')

Resuming from index 7731

Processing batch 310 of 402
Processing rows 7731 to 7756
Genres:  ['cool jazz', 'jazz vibraphone', 'latin jazz']
Genres:  ['lovers rock']
Genres:  ['acoustic blues', 'blues', 'country blues', 'gospel blues', 'texas blues', 'traditional blues']
Genres:  ['experimental', 'experimental vocal', 'industrial', 'laboratorio']
Genres:  []
Genres:  ['gothic black metal', 'gothic metal', 'melodic death metal', 'metal', 'portuguese black metal', 'portuguese metal', 'portuguese rock', 'power metal']
Genres:  ['new age']
Genres:  ['banjo', 'country gospel', 'western swing']
Genres:  ['atmospheric post-metal', 'drone metal', 'post-metal', 'sludge metal']
Genres:  ['ccm', 'christian alternative rock', 'christian music', 'christian rock']
Genres:  ['detroit hip hop', 'kansas city hip hop', 'pop rap']
Genres:  ['acoustic blues', 'blues', 'country blues', 'harmonica blues', 'piedmont blues', 'traditional blues']
Genres:  ['grunge pop']
Genres:  []
Genres:  ['big band', "man's o

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:anna bolena 1997 digital remaster alcun potria ascoltarti artist:maria callasgianni raimondigabriella carturanplinio clabassinicola rossilemenicoro del teatro alla scala_ milanonoberto molaorchestra del teatro alla scala_ milanogianandrea gavazzeni', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Query exceeds maximum length of 250 characters


Error processing track anna bolena 1997 digital remaster alcun potria ascoltarti: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=track%3Aanna+bolena+1997+digital+remaster+alcun+potria+ascoltarti+artist%3Amaria+callasgianni+raimondigabriella+carturanplinio+clabassinicola+rossilemenicoro+del+teatro+alla+scala_+milanonoberto+molaorchestra+del+teatro+alla+scala_+milanogianandrea+gavazzeni&limit=1&offset=0&type=track:
 Query exceeds maximum length of 250 characters, reason: None
Genres:  ['classic country pop', 'traditional country']
Genres:  []
Genres:  ['c-pop', 'classic mandopop']
Genres:  ['orchestral soundtrack', 'soundtrack']
Genres:  ['dance pop', 'pop', 'urban contemporary']
Genres:  ['album rock', 'classic canadian rock', 'classic rock', 'glam metal', 'hard rock']
Genres:  ['latin arena pop', 'latin pop', 'spanish pop', 'spanish pop rock']
Genres:  []
Genres:  ['spanish pop']
Genres:  ['contemporary country', 'country', 'country dawn', 'country road']
Genres:  ['ab

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:turandot 2008 digital remaster_ act iii scene i nessum dorma artist:maria callaseugenio fernandielisabeth schwarzkopfgiuseppe nessinicola zaccariamario borriellorenato ercolanipiero de palmagiulio maurielisabetta fuscopinuccia perotticoro del teatro alla scala_ milanoorchestra del teatro alla scala_ milanotul', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Query exceeds maximum length of 250 characters


Genres:  ['classic country pop', 'honky tonk', 'traditional country', 'western swing']
Error processing track turandot 2008 digital remaster_ act iii scene i nessum dorma: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=track%3Aturandot+2008+digital+remaster_+act+iii+scene+i+nessum+dorma+artist%3Amaria+callaseugenio+fernandielisabeth+schwarzkopfgiuseppe+nessinicola+zaccariamario+borriellorenato+ercolanipiero+de+palmagiulio+maurielisabetta+fuscopinuccia+perotticoro+del+teatro+alla+scala_+milanoorchestra+del+teatro+alla+scala_+milanotul&limit=1&offset=0&type=track:
 Query exceeds maximum length of 250 characters, reason: None
Genres:  ['bhangra', 'classic pakistani pop', 'pakistani pop', 'sufi']
Genres:  ['british soul', 'new wave pop']
Genres:  ['deep soundtrack', 'soundtrack']
Genres:  ['chanson']
Genres:  ["man's orchestra"]
Genres:  []
Genres:  ['turntablism']
Genres:  ['blues', 'boogie-woogie', 'louisiana blues', 'memphis blues', 'new orleans blues', 'new orleans sou