In [47]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from random import randint
from time import sleep
import pandas as pd
import math

In [6]:
# Reading password
secrets_file = open("secrets.txt","r")
string = secrets_file.read()
secrets_dict={}
for line in string.split('\n'):
    if len(line) > 0:
        secrets_dict[line.split(':')[0]]=line.split(':')[1]

In [7]:
#Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['cid'],
client_secret=secrets_dict['csecret']))

# Handy functions

In [8]:
def get_full_page(playlist):
    # it iterates through each page until the end for huge playlists or albums
    # and returns the whole list of songs 
    results = playlist.copy()
    tracks = results['items']
    while results['next']!=None:
        #sleeping
        wait_time = randint(0,500)/1000
        sleep(wait_time)
        
        results = sp.next(results)
        tracks = tracks + results['items']
        sleep(randint(1,3))
    return tracks

In [9]:
def get_album_list(playlist):
    # Gets all the albums from a list of songs by
    # returning a list of tuples (album_name,album_id)
    album_ids = []
    full_playlist = get_full_page(playlist)
    
    for song in full_playlist:
        album_id = (song['track']['album']['name'],song['track']['album']['id'])
        
        if album_id not in album_ids:
            album_ids.append(album_id)
            
    return album_ids

In [10]:
def get_songs_from_album_list(album_list):
    # Iterates through each album and adds all its song to a new dataframe that a new
    # list that is returned
    songs = []
    for album,album_id in album_list:
        #sleeping
        wait_time = randint(0,250)/1000
        sleep(wait_time)
        
        #reading album tracks
        full_album = get_full_page(sp.album_tracks(album_id))
        
        #appending records to songs
        songs = songs + full_album
        
    return songs

In [88]:
def done_alarm():
    import beepy
    beepy.beep(sound=1)
    beepy.beep(sound=1)
    beepy.beep(sound=1)
    beepy.beep(sound=1)
    beepy.beep(sound=1)
    beepy.beep(sound=5)
    return

## Reading hits between 2020-2022

In [16]:
playlist = sp.user_playlist_tracks("spotify", "1638KZlvcvyyEJ15S8erge")

In [17]:
%%time
full_play = get_full_page(playlist)

CPU times: user 91.5 ms, sys: 13.2 ms, total: 105 ms
Wall time: 10.8 s


In [18]:
%%time 
albums = get_album_list(playlist)

CPU times: user 42.6 ms, sys: 4.32 ms, total: 46.9 ms
Wall time: 6.34 s


In [19]:
%%time 
dataset1 = get_songs_from_album_list(albums)

CPU times: user 1.11 s, sys: 193 ms, total: 1.3 s
Wall time: 1min 46s


In [20]:
len(dataset1)

2226

# Reading hits between 2010-2019

In [21]:
playlist = sp.user_playlist_tracks("spotify", "6Pi3jayiuzwmA5i6tLtIap")

In [22]:
%%time
full_play = get_full_page(playlist)

CPU times: user 77.6 ms, sys: 10.3 ms, total: 87.9 ms
Wall time: 27.4 s


In [23]:
%%time 
albums = get_album_list(playlist)

CPU times: user 87.1 ms, sys: 9.46 ms, total: 96.5 ms
Wall time: 28.2 s


In [24]:
%%time 
dataset2 = get_songs_from_album_list(albums)

CPU times: user 2.17 s, sys: 364 ms, total: 2.53 s
Wall time: 3min 18s


In [25]:
len(dataset2)

8754

# Reading longest playlist ever

In [11]:
playlist = sp.user_playlist_tracks("spotify", "6FKDzNYZ8IW1pvYVF4zUN2")

In [12]:
%%time
full_play = get_full_page(playlist)

CPU times: user 957 ms, sys: 149 ms, total: 1.11 s
Wall time: 4min 19s


In [13]:
%%time 
albums = get_album_list(playlist)

CPU times: user 2.12 s, sys: 172 ms, total: 2.29 s
Wall time: 4min 12s


In [14]:
%%time 
dataset3 = get_songs_from_album_list(albums)

CPU times: user 27.1 s, sys: 3.59 s, total: 30.7 s
Wall time: 32min 28s


In [15]:
len(dataset3)

75215

# Creating a datafame

In [26]:
# Concatenating songs from both playlists (and associated albums)
dataset = dataset1+dataset2+dataset3
len(dataset)

86195

In [29]:
%%time
# Creating a dataframe from the songs
dataframe = pd.DataFrame(columns=['title','artist','uri'])
for i,song in enumerate(dataset):
    dataframe.loc[i,'title'] = song['name']
    dataframe.loc[i,'artist'] = ', '.join([aux['name'] for aux in song['artists']])
    dataframe.loc[i,'uri'] = song['uri']  

CPU times: user 7min 26s, sys: 28 s, total: 7min 54s
Wall time: 7min 55s


In [30]:
# removing duplicate songs 
dataframe = dataframe.drop_duplicates().reset_index(drop=True)

In [77]:
dataframe

Unnamed: 0,title,artist,uri
0,Hits 2020 - Mashup,"Trinix Remix, Trinix",spotify:track:45v4pDxE0VuSZOx81FtgBS
1,As It Was,Harry Styles,spotify:track:4LRPiXqCikLlN15c3yImP7
2,About Damn Time,Lizzo,spotify:track:1PckUlxKqWQs3RlWXVBLw3
3,Alone Again,The Weeknd,spotify:track:2geUtMB7UX968R11pCoXbU
4,Too Late,The Weeknd,spotify:track:6BSAGCfWNAlx4dVlvxx9UT
...,...,...,...
83728,Sunny Afternoon,Red Velvet,spotify:track:3G6iqKcRzEYwSu9zE2qAMW
83729,Fool,Red Velvet,spotify:track:21C7B7PWYDW00czamOjleO
83730,Some Love,Red Velvet,spotify:track:5n8SuNZuDaElJQMt0ySzCf
83731,My Dear,Red Velvet,spotify:track:6s25gIYsBc8oF0VQFK7wzt


# Dividing the dataframe
because the dataset is too big

In [78]:
divided = []
div = 20000

for i in range(math.ceil(len(dataframe)/div)):
    divided.append(dataframe.loc[i*div:((i+1)*div-1)])
len(divided)

5

#### Dataframe 0

In [85]:
%%time
dataframe0 = divided[0].copy()
# Reading spotify audio features for each song and adding it to our dataframe
for i,row in dataframe0.iterrows():
    features = sp.audio_features(row['uri'])[0]
    for feature in features:
        dataframe0.loc[i,feature] = features[feature]

CPU times: user 1min 35s, sys: 8.8 s, total: 1min 44s
Wall time: 32min 16s


In [89]:
done_alarm()

#### Dataframe 1

In [105]:
%%time
dataframe1 = divided[1].copy()
# Reading spotify audio features for each song and adding it to our dataframe
for i,row in dataframe1.iterrows():
    #sleeping
    wait_time = randint(0,100)/1000
    sleep(wait_time)
    features = sp.audio_features(row['uri'])[0]
    for feature in features:
        dataframe1.loc[i,feature] = features[feature]

CPU times: user 1min 35s, sys: 11.5 s, total: 1min 47s
Wall time: 36min 36s


In [106]:
done_alarm()

#### Dataframe 2

In [136]:
%%time
dataframe2 = divided[2].copy()
# Reading spotify audio features for each song and adding it to our dataframe
for i,row in dataframe2.iterrows():
    try:
        features = sp.audio_features(row['uri'])[0]
        
        for feature in features:
            dataframe2.loc[i,feature] = features[feature]
    except:
        # if spotify can't give us its audio qualities, drop the song from the database
        dataframe2 = dataframe2.drop([i],axis=0)

CPU times: user 1min 48s, sys: 8.84 s, total: 1min 57s
Wall time: 32min 13s


In [137]:
done_alarm()

#### Dataframe 3

In [141]:
%%time
dataframe3 = divided[3].copy()
# Reading spotify audio features for each song and adding it to our dataframe
for i,row in dataframe3.iterrows():
    try:
        features = sp.audio_features(row['uri'])[0]
        
        for feature in features:
            dataframe3.loc[i,feature] = features[feature]
    except:
        # if spotify can't give us its audio qualities, drop the song from the database
        dataframe3 = dataframe3.drop([i],axis=0)

CPU times: user 2min 9s, sys: 9.82 s, total: 2min 19s
Wall time: 32min 33s


In [144]:
done_alarm()

#### Dataframe 4

In [145]:
%%time
dataframe4 = divided[4].copy()
# Reading spotify audio features for each song and adding it to our dataframe
for i,row in dataframe4.iterrows():
    try:
        features = sp.audio_features(row['uri'])[0]
        
        for feature in features:
            dataframe4.loc[i,feature] = features[feature]
    except:
        # if spotify can't give us its audio qualities, drop the song from the database
        dataframe4 = dataframe4.drop([i],axis=0)

CPU times: user 20.5 s, sys: 1.7 s, total: 22.2 s
Wall time: 6min 5s


In [170]:
done_alarm()

pandas.core.frame.DataFrame

#### Concatenating everything back

In [176]:
dataframe = pd.concat([dataframe0,dataframe1,dataframe2,dataframe3,dataframe4],axis=0).reset_index(drop=True)

In [178]:
dataframe.head()

Unnamed: 0,title,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,track_href,analysis_url,duration_ms,time_signature
0,Hits 2020 - Mashup,"Trinix Remix, Trinix",spotify:track:45v4pDxE0VuSZOx81FtgBS,0.51,0.709,1.0,-7.194,0.0,0.162,0.128,1.5e-05,0.0653,0.662,142.51,audio_features,45v4pDxE0VuSZOx81FtgBS,https://api.spotify.com/v1/tracks/45v4pDxE0VuS...,https://api.spotify.com/v1/audio-analysis/45v4...,133591.0,5.0
1,As It Was,Harry Styles,spotify:track:4LRPiXqCikLlN15c3yImP7,0.52,0.731,6.0,-5.338,0.0,0.0557,0.342,0.00101,0.311,0.662,173.93,audio_features,4LRPiXqCikLlN15c3yImP7,https://api.spotify.com/v1/tracks/4LRPiXqCikLl...,https://api.spotify.com/v1/audio-analysis/4LRP...,167303.0,4.0
2,About Damn Time,Lizzo,spotify:track:1PckUlxKqWQs3RlWXVBLw3,0.836,0.743,10.0,-6.305,0.0,0.0656,0.0995,0.0,0.335,0.722,108.966,audio_features,1PckUlxKqWQs3RlWXVBLw3,https://api.spotify.com/v1/tracks/1PckUlxKqWQs...,https://api.spotify.com/v1/audio-analysis/1Pck...,191822.0,4.0
3,Alone Again,The Weeknd,spotify:track:2geUtMB7UX968R11pCoXbU,0.256,0.653,6.0,-7.972,0.0,0.0544,0.0337,0.113,0.168,0.0581,90.673,audio_features,2geUtMB7UX968R11pCoXbU,https://api.spotify.com/v1/tracks/2geUtMB7UX96...,https://api.spotify.com/v1/audio-analysis/2geU...,250057.0,5.0
4,Too Late,The Weeknd,spotify:track:6BSAGCfWNAlx4dVlvxx9UT,0.64,0.869,4.0,-5.179,0.0,0.117,0.0518,0.000119,0.691,0.185,120.07,audio_features,6BSAGCfWNAlx4dVlvxx9UT,https://api.spotify.com/v1/tracks/6BSAGCfWNAlx...,https://api.spotify.com/v1/audio-analysis/6BSA...,239980.0,4.0


In [43]:
# Taking out some extra columns I don't want
dataframe = dataframe.drop(['type','track_href','analysis_url'],axis=1)

In [44]:
dataframe

Unnamed: 0,title,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature
0,Hits 2020 - Mashup,"Trinix Remix, Trinix",spotify:track:45v4pDxE0VuSZOx81FtgBS,0.510,0.709,1.0,-7.194,0.0,0.1620,0.1280,0.000015,0.0653,0.6620,142.510,45v4pDxE0VuSZOx81FtgBS,133591.0,5.0
1,As It Was,Harry Styles,spotify:track:4LRPiXqCikLlN15c3yImP7,0.520,0.731,6.0,-5.338,0.0,0.0557,0.3420,0.001010,0.3110,0.6620,173.930,4LRPiXqCikLlN15c3yImP7,167303.0,4.0
2,About Damn Time,Lizzo,spotify:track:1PckUlxKqWQs3RlWXVBLw3,0.836,0.743,10.0,-6.305,0.0,0.0656,0.0995,0.000000,0.3350,0.7220,108.966,1PckUlxKqWQs3RlWXVBLw3,191822.0,4.0
3,Alone Again,The Weeknd,spotify:track:2geUtMB7UX968R11pCoXbU,0.256,0.653,6.0,-7.972,0.0,0.0544,0.0337,0.113000,0.1680,0.0581,90.673,2geUtMB7UX968R11pCoXbU,250057.0,5.0
4,Too Late,The Weeknd,spotify:track:6BSAGCfWNAlx4dVlvxx9UT,0.640,0.869,4.0,-5.179,0.0,0.1170,0.0518,0.000119,0.6910,0.1850,120.070,6BSAGCfWNAlx4dVlvxx9UT,239980.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83728,Sunny Afternoon,Red Velvet,spotify:track:3G6iqKcRzEYwSu9zE2qAMW,,,,,,,,,,,,,,
83729,Fool,Red Velvet,spotify:track:21C7B7PWYDW00czamOjleO,,,,,,,,,,,,,,
83730,Some Love,Red Velvet,spotify:track:5n8SuNZuDaElJQMt0ySzCf,,,,,,,,,,,,,,
83731,My Dear,Red Velvet,spotify:track:6s25gIYsBc8oF0VQFK7wzt,,,,,,,,,,,,,,


In [179]:
# I'm saving this because I'd like to skip the waiting time for next time
dataframe.to_csv('songs90k.csv',index=False)

In [40]:
len(dataset)

86195

#### old code

In [None]:
"""%%time
# Reading spotify audio features for each song and adding it to our dataframe
for i,row in dataframe.iterrows():
    #sleeping
    wait_time = randint(0,500)/1000
    sleep(wait_time)
    
    features = sp.audio_features(row['uri'])[0]
    for feature in features:
        dataframe.loc[i,feature] = features[feature]"""