In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from random import randint
from time import sleep
import pandas as pd

In [2]:
# Reading password
# Each non-blank line of secrets.txt is expected to look like "key:value".
# The file is closed as soon as it has been read; keys and values are stripped
# so stray spaces or Windows line endings do not end up inside the credentials.
secrets_dict={}
with open("secrets.txt","r") as secrets_file:
    string = secrets_file.read()
for line_number, line in enumerate(string.splitlines(), start=1):
    if not line.strip():
        continue
    # split on the first ':' only, so a value that itself contains ':' stays whole
    key, separator, value = line.partition(':')
    if not separator:
        # report the position only, never the content, to avoid leaking a secret
        raise ValueError(f"secrets.txt line {line_number}: expected 'key:value'")
    secrets_dict[key.strip()] = value.strip()

In [3]:
# Build the Spotify client from the app credentials held in secrets_dict
credentials_manager = SpotifyClientCredentials(
    client_id=secrets_dict['cid'],
    client_secret=secrets_dict['csecret'],
)
sp = spotipy.Spotify(auth_manager=credentials_manager)

# Handy functions

In [4]:
def get_full_page(playlist):
    """Collect the items of every page of a paginated result.

    Starting from an already-fetched first page, keeps asking the
    module-level ``sp`` client for the next page until ``next`` is empty,
    pausing around each request.

    Parameters
    ----------
    playlist : dict
        First page of the result; must contain ``'items'`` and ``'next'``.

    Returns
    -------
    list
        A new list holding the items of all pages, in page order. The
        caller's page and its ``'items'`` list are not modified.
    """
    results = playlist
    # Copy the first page's items so extending below never touches the caller's list.
    tracks = list(results['items'])
    # Truthiness covers both None and an empty value for 'next'.
    while results['next']:
        # short random pause (0-0.5 s) before the request
        wait_time = randint(0,500)/1000
        sleep(wait_time)

        results = sp.next(results)
        tracks.extend(results['items'])
        # longer random pause (1-3 s) after the request
        sleep(randint(1,3))
    return tracks

In [13]:
def get_album_list(playlist):
    """Return the distinct albums referenced by the songs of a playlist.

    Parameters
    ----------
    playlist : dict
        First page of a playlist's items; the remaining pages are fetched
        through ``get_full_page``.

    Returns
    -------
    list of tuple
        ``(album_name, album_id)`` pairs in order of first appearance,
        each album listed once.
    """
    album_ids = []
    seen = set()  # constant-time membership test; album_ids keeps the order

    for song in get_full_page(playlist):
        track = song.get('track')
        if not track:
            # items whose track data is missing cannot name an album; skip them
            continue

        album = track['album']
        album_id = (album['name'], album['id'])

        if album_id not in seen:
            seen.add(album_id)
            album_ids.append(album_id)

    return album_ids

In [14]:
def get_songs_from_album_list(album_list):
    """Fetch the tracks of every album in ``album_list``.

    ``album_list`` holds ``(album_name, album_id)`` tuples; only the id is
    used for the lookup. Returns one flat list with the track items of all
    albums, in the order the albums were given.
    """
    collected = []
    for _name, album_id in album_list:
        # random pause of up to 250 ms before each album request
        sleep(randint(0,250)/1000)

        # first page of the album's tracks, then whatever pages follow it
        first_page = sp.album_tracks(album_id)
        collected.extend(get_full_page(first_page))

    return collected

## Reading hits between 2020-2022

In [15]:
# Fetch the first page of the playlist's items.
# playlist_items is the current spotipy call; user_playlist_tracks is a deprecated wrapper.
# additional_types=("track",) keeps the items limited to tracks, as before.
playlist = sp.playlist_items("1638KZlvcvyyEJ15S8erge", additional_types=("track",))

In [16]:
%%time
# Walk every page of the playlist and keep all of its items in one list.
# NOTE(review): `full_play` is not read by any later cell visible here;
# get_album_list(playlist) pages through the playlist again on its own.
full_play = get_full_page(playlist)

CPU times: user 37.6 ms, sys: 8.8 ms, total: 46.4 ms
Wall time: 11.3 s


In [17]:
%%time 
# Distinct (album_name, album_id) pairs for the songs of this playlist.
albums = get_album_list(playlist)

CPU times: user 43.3 ms, sys: 9.12 ms, total: 52.4 ms
Wall time: 11.7 s


In [19]:
%%time 
# All track items of every album found above (one or more requests per album).
dataset1 = get_songs_from_album_list(albums)

CPU times: user 1.04 s, sys: 188 ms, total: 1.23 s
Wall time: 1min 45s


In [None]:
# Number of track items gathered from the albums of the first playlist.
len(dataset1)

## Reading hits between 2010-2019

In [21]:
# Fetch the first page of the playlist's items.
# playlist_items is the current spotipy call; user_playlist_tracks is a deprecated wrapper.
# additional_types=("track",) keeps the items limited to tracks, as before.
playlist = sp.playlist_items("6Pi3jayiuzwmA5i6tLtIap", additional_types=("track",))

In [22]:
%%time
# Walk every page of the playlist and keep all of its items in one list.
# NOTE(review): `full_play` is not read by any later cell visible here;
# get_album_list(playlist) pages through the playlist again on its own.
full_play = get_full_page(playlist)

CPU times: user 83.3 ms, sys: 16.1 ms, total: 99.4 ms
Wall time: 29 s


In [23]:
%%time 
# Distinct (album_name, album_id) pairs for the songs of this playlist.
albums = get_album_list(playlist)

CPU times: user 98.4 ms, sys: 14.5 ms, total: 113 ms
Wall time: 30.2 s


In [24]:
%%time 
# All track items of every album found above (one or more requests per album).
dataset2 = get_songs_from_album_list(albums)

CPU times: user 2.16 s, sys: 382 ms, total: 2.54 s
Wall time: 3min 19s


In [26]:
# Number of track items gathered from the albums of the second playlist.
len(dataset2)

8754

In [46]:
# NOTE(review): scratch cell; `a` is not referenced by any other cell visible here, so it is a candidate for removal.
a = {'manuel' : 'madrid', 'fernando' : 'huelva'}

# Creating a dataframe

In [32]:
# Concatenating songs from both playlists (and associated albums)
# into a single list; duplicates are only removed later, once the data is in a dataframe.
dataset = dataset1+dataset2
# Cell output: total number of track items in the combined list.
len(dataset)

10980

In [33]:
%%time
# Creating a dataframe from the songs
dataframe = pd.DataFrame(columns=['title','artist','uri'])
for i,song in enumerate(dataset):
    dataframe.loc[i,'title'] = song['name']
    dataframe.loc[i,'artist'] = ', '.join([aux['name'] for aux in song['artists']])
    dataframe.loc[i,'uri'] = song['uri']  

CPU times: user 6.56 s, sys: 195 ms, total: 6.76 s
Wall time: 6.86 s


In [39]:
# removing duplicate songs: rows are compared on every column, and the index
# is renumbered from 0 afterwards so it stays contiguous
dataframe = dataframe.drop_duplicates().reset_index(drop=True)

In [40]:
# Preview the de-duplicated song table.
dataframe

Unnamed: 0,title,artist,uri
0,Hits 2020 - Mashup,"Trinix Remix, Trinix",spotify:track:45v4pDxE0VuSZOx81FtgBS
1,As It Was,Harry Styles,spotify:track:4LRPiXqCikLlN15c3yImP7
2,About Damn Time,Lizzo,spotify:track:1PckUlxKqWQs3RlWXVBLw3
3,Alone Again,The Weeknd,spotify:track:2geUtMB7UX968R11pCoXbU
4,Too Late,The Weeknd,spotify:track:6BSAGCfWNAlx4dVlvxx9UT
...,...,...,...
10790,Alive,Jessie J,spotify:track:2VthUPIyeeE7Xh31Grb609
10791,Unite,Jessie J,spotify:track:7dx1maQh7mN7PHRTnZNFI9
10792,Hero,Jessie J,spotify:track:1bZgbYc9FKJoFQ4mOQl3Pb
10793,Magnetic,Jessie J,spotify:track:0w8VqEISiNmug0Wm4vLyIw


In [41]:
%%time
# Reading spotify audio features for each song and adding it to our dataframe
for i,row in dataframe.iterrows():
    #sleeping
    wait_time = randint(0,50)/1000
    sleep(wait_time)
    
    features = sp.audio_features(row['uri'])[0]
    for feature in features:
        dataframe.loc[i,feature] = features[feature]

CPU times: user 41.5 s, sys: 4.35 s, total: 45.9 s
Wall time: 20min 59s


In [42]:
# Taking out some extra columns I don't want.
# errors='ignore' lets this cell be run again after the columns are already gone.
dataframe = dataframe.drop(columns=['type','track_href','analysis_url'], errors='ignore')

In [43]:
# Preview the song table with the audio-feature columns added.
dataframe

Unnamed: 0,title,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature
0,Hits 2020 - Mashup,"Trinix Remix, Trinix",spotify:track:45v4pDxE0VuSZOx81FtgBS,0.510,0.709,1.0,-7.194,0.0,0.1620,0.128000,0.000015,0.0653,0.6620,142.510,45v4pDxE0VuSZOx81FtgBS,133591.0,5.0
1,As It Was,Harry Styles,spotify:track:4LRPiXqCikLlN15c3yImP7,0.520,0.731,6.0,-5.338,0.0,0.0557,0.342000,0.001010,0.3110,0.6620,173.930,4LRPiXqCikLlN15c3yImP7,167303.0,4.0
2,About Damn Time,Lizzo,spotify:track:1PckUlxKqWQs3RlWXVBLw3,0.836,0.743,10.0,-6.305,0.0,0.0656,0.099500,0.000000,0.3350,0.7220,108.966,1PckUlxKqWQs3RlWXVBLw3,191822.0,4.0
3,Alone Again,The Weeknd,spotify:track:2geUtMB7UX968R11pCoXbU,0.256,0.653,6.0,-7.972,0.0,0.0544,0.033700,0.113000,0.1680,0.0581,90.673,2geUtMB7UX968R11pCoXbU,250057.0,5.0
4,Too Late,The Weeknd,spotify:track:6BSAGCfWNAlx4dVlvxx9UT,0.640,0.869,4.0,-5.179,0.0,0.1170,0.051800,0.000119,0.6910,0.1850,120.070,6BSAGCfWNAlx4dVlvxx9UT,239980.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10790,Alive,Jessie J,spotify:track:2VthUPIyeeE7Xh31Grb609,0.604,0.604,2.0,-10.258,1.0,0.1460,0.001200,0.000060,0.1440,0.7220,157.961,2VthUPIyeeE7Xh31Grb609,204587.0,4.0
10791,Unite,Jessie J,spotify:track:7dx1maQh7mN7PHRTnZNFI9,0.549,0.566,1.0,-8.504,0.0,0.2390,0.014900,0.000000,0.2740,0.3290,105.038,7dx1maQh7mN7PHRTnZNFI9,231200.0,4.0
10792,Hero,Jessie J,spotify:track:1bZgbYc9FKJoFQ4mOQl3Pb,0.227,0.750,11.0,-7.781,1.0,0.0570,0.000156,0.007100,0.0824,0.3550,185.303,1bZgbYc9FKJoFQ4mOQl3Pb,199987.0,4.0
10793,Magnetic,Jessie J,spotify:track:0w8VqEISiNmug0Wm4vLyIw,0.617,0.609,4.0,-8.430,1.0,0.0496,0.006660,0.000000,0.3440,0.6610,139.978,0w8VqEISiNmug0Wm4vLyIw,234507.0,4.0


In [45]:
# I'm saving this because I'd like to skip the waiting time for next time
# (index=False keeps the row index out of the CSV file)
dataframe.to_csv('name.csv',index=False)