In [1]:
import pandas as pd
import pickle
import unicodedata
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [18]:
songs = pd.read_pickle('songsWeeklyRanks.pkl')

In [19]:
songs["SpotifyID"] = None

In [20]:
cid = ""
secret = ""

#initiating Spotify client
client_credentials_manager = SpotifyClientCredentials(client_id = cid, client_secret = secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [11]:
#removes special characters such as "é,ñ" for example "Michael Bublé" to "Michael Buble"
def getStringWithoutSpecialAccent(string):
    return ''.join((c for c in unicodedata.normalize('NFD', string) if unicodedata.category(c) != 'Mn'))

#getSongID takes track's name, track's Artist, and track results
#iterate through the track results to get the name in what spotify returned and
# test those results against what song and artist we have in the data frame
def getSongID(trackName, trackArtist, trackResults):
    for track in trackResults['tracks']['items']:
        if (track['name'].lower() == trackName.lower()) & (track['artists'][0]['name'].lower() == trackArtist.lower()):
            return track['id']
    return None

#getAlbumID takes in album's name, artist, and search results (which contains the search results after searching
#for the album using the spotify API)
def getAlbumID(albumName, albumArtist, searchResult):
    for album in searchResult['albums']['items']:
        searchResultAlbumName  = getStringWithoutSpecialAccent(album['name'].lower()) #what spotify gave us
        queryAlbumName         = getStringWithoutSpecialAccent(albumName.lower()) #what we have in the data
        searchResultArtistName = getStringWithoutSpecialAccent(album['artists'][0]['name'].lower())
        queryArtistName        = getStringWithoutSpecialAccent(albumArtist.lower())
        if searchResultArtistName == queryArtistName: #validating that what spotify returned is what we are looking for
            if queryAlbumName in searchResultAlbumName: #the results match
                return album['id']
        
            
    return None

#getAlbumSongs takes in album's artist and the album track results (which is a list of )
#it gets the songs in the album and returns it
def getAlbumSongs(albumArtist, albumTracksResults):
    albumSongDF = pd.DataFrame(columns=["Title","Artist", "SpotifyID"])
    for songs in albumTracksResults['items']:
        newRow = pd.Series(data={"Title": songs['name'],"Artist": albumArtist, "SpotifyID": songs['id']})
        albumSongDF = albumSongDF.append(newRow, ignore_index=True)
    return albumSongDF

In [51]:
##create two df
songsToDeleteDF = pd.DataFrame(columns=["Title","Artist", "SpotifyID"])
albumSongToAddDF = pd.DataFrame(columns=["Date","Title","Artist","Rank", "SpotifyID"])

spotifyIDForSongsCache = {} #stores spotify ids for songs that you already queried
songsDSForAlbumCache = {} #stores DataFrame of Songs (Title,Artist,SpotifyID) from an Album

#iterate through all song rows and find SpotifyID
for index, row in songs.iterrows(): 
    songNameAndArtist = row['Title'] + "-" + row['Artist'] #used for key values on caches
    if spotifyIDForSongsCache.get(songNameAndArtist) is not None: #if it exists in cache, use that instead
        #print(songNameAndArtist + " Used Song Cache")
        songID = spotifyIDForSongsCache[songNameAndArtist]
        songs.SpotifyID.iloc[index] = songID
        continue
    if songsDSForAlbumCache.get(songNameAndArtist) is not None: #if it exists in cache, use that instead
        #print(songNameAndArtist + " Used Album Cache")
        songsFromAlbum = songsDSForAlbumCache[songNameAndArtist]
        songsFromAlbum["Date"] = row["Date"]
        songsFromAlbum["Rank"] = row["Rank"]
        albumSongToAddDF = pd.concat([albumSongToAddDF,songsFromAlbum], ignore_index=True)
        continue
    
    #get songID from Spotify Search API
    trackResults = sp.search(q=row['Title'], type='track', market= 'US', limit=10,offset=0)
    songID = getSongID(row['Title'], row['Artist'], trackResults)
    #If songID is not found, we assume it is an album and search for its albumID
    if  songID == None:
        #Delete the row from songs DF that does not have spotifyID since they are an album
        songsToDeleteDF = songsToDeleteDF.append(row, ignore_index=True)
        searchResult = sp.search(q=row['Title'], type='album', market= 'US', limit=10,offset=0) #search results for album
        albumID = getAlbumID(row['Title'], row['Artist'], searchResult) #parse results
        #If we find an albumID, we get all the songs from that album and add it to the DF
        if albumID != None:
            albumTracksResults = sp.album_tracks(albumID)
            songsFromAlbum = getAlbumSongs(row['Artist'],albumTracksResults)
            #cache
            keyToStore = songNameAndArtist
            songsDSForAlbumCache[keyToStore] = songsFromAlbum
            #add date and rank
            songsFromAlbum["Date"] = row["Date"]
            songsFromAlbum["Rank"] = row["Rank"]
            
            albumSongToAddDF = pd.concat([albumSongToAddDF,songsFromAlbum], ignore_index=True)
    else:
        #Add the spotifyID to an existing song
        #cache 
        keyToStore = songNameAndArtist
        spotifyIDForSongsCache[keyToStore] = songID
        
        songs.SpotifyID.iloc[index] = songID


In [22]:
songs["Rank"] = pd.to_numeric(songs["Rank"], downcast = 'integer')
songs = pd.concat([songs,albumSongToAddDF],ignore_index=True) #add the album songs into the song DF
songs = songs[~songs.Title.isin(songsToDeleteDF.Title)].copy() #remove album or song without spotifyID from songs DF
songs.head()

Unnamed: 0,Date,Title,Artist,Rank,SpotifyID
0,2020-01-01,Fine Line,Harry Styles,1,6VzcQuzTNTMFnJ6rBSaLH9
5,2020-01-01,Hollywood's Bleeding,Post Malone,6,7sWRlDoTDX8geTR8zzr2vt
10,2020-01-01,Lover,Taylor Swift,11,1dGr1c8CrMLDpV6mPbImSI
15,2020-01-01,Rudolph The Red-Nosed Reindeer,Burl Ives,16,47otoIkhx3fkdivEXL5OB6
18,2020-01-01,What You See Is What You Get,Luke Combs,19,273TiTHLlHSRe5zrzs7wvD


In [48]:
songs.sort_values(by=["Date", "Rank"],inplace=True, ignore_index = True) #sorting the DF by date and rank

In [50]:
songs.to_pickle('songsWeeklyRanksWithSpotifyID.pkl')