In [1]:
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
import spotipy.util as util
from tqdm.notebook import tqdm
from dateutil.parser import isoparse as dateparse
from datetime import datetime, timedelta
import pickle
from cluster_classifier import predict
import hashlib
import numpy as np

In [2]:
class RecentScraper():
    """
    Scrapes the user's recently played Spotify tracks, keeps only the plays
    that came from a tracked playlist, attaches audio features + a cluster
    label to each play, groups plays into listening sessions and (optionally)
    appends them to the "recent" csv.

    State kept on disk:
        recent_path         csv of all plays recorded so far
        recent_backup_path  copy of the recent csv as it was before the last save
        last_session_path   pickled dict {'timestamp': unix ms of last play,
                                          'session': id of last session}
        all_path            csv of known songs (indexed by uri) with their
                            audio features and label; read only, new songs are
                            added to the in-memory all_songs df but are NOT
                            written back to all_path
    """

    #played_at values from the API are UTC; they are shifted by this fixed
    #amount before being stored (presumably US Eastern daylight time -- TODO confirm)
    UTC_OFFSET = timedelta(hours = 4)

    #songs played more than this far apart are considered separate sessions
    SESSION_GAP = timedelta(minutes = 20)

    #datetime format of the played_at column in the recent df
    TIME_FORMAT = "(%m/%d) %H:%M %p"

    #reference point for converting naive UTC datetimes <-> unix timestamps
    #without depending on the timezone of the machine running the notebook
    EPOCH = datetime(1970, 1, 1)

    def __init__(self, redirect_uri = "http://localhost:8000", token_path = "ref/api.txt", recent_backup_path = "data/recent_backup.csv", recent_path = "data/recent.csv", playlist_path = 'ref/playlists.data', last_session_path = "data/last_session.data", all_path = "data/all_songs_clustered.csv"):
        """
        Loads credentials, authenticates against Spotify and loads all saved state.

        token_path          text file with three lines: username, client id, client secret
        redirect_uri        OAuth redirect uri passed to spotipy
        playlist_path       pickled collection of playlist uris to track
        recent_path / recent_backup_path / last_session_path / all_path
                            see class docstring
        """
        #load credentials
        with open(token_path, "r") as f:
            self.username, self.client_id, self.client_secret = [x.strip() for x in f]
        
        #load Spotify client
        scope = "user-read-recently-played playlist-modify-private user-library-read user-top-read user-library-modify user-modify-playback-state streaming"
        token = util.prompt_for_user_token(self.username,scope,self.client_id,self.client_secret,redirect_uri)
        self.sp = spotipy.Spotify(auth=token)
        
        #load playlist uris: only save songs that are from these playlists
        #NOTE: pickle.load executes arbitrary code; only point this at files you created
        with open(playlist_path, 'rb') as f:
            self.playlist_uris = pickle.load(f)
        
        self.recent_path = recent_path
        self.recent = pd.read_csv(recent_path)
        self.recent_backup_path = recent_backup_path
        
        self.last_session_path = last_session_path
        with open(last_session_path, 'rb') as f:
            data = pickle.load(f)
            self.last_timestamp = int(data['timestamp'])
            self.last_session = data['session']
            
        self.all_path = all_path
        self.all_songs = pd.read_csv(all_path, index_col =  'uri')
        self.audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness','liveness', 'valence', 'tempo']
        
    def _get_track_info(self, track_dict):
        """
        given one item of Spotify's recently-played payload, returns (song_data, new)
            song_data: dict with artist, title, played_at (datetime, shifted by
                       UTC_OFFSET), uri, the audio features and the label
            new:       True if the song is NOT yet in all_songs
                if new, then retrieves audio features from the API and predicts label
                if not, then copies over audio features + label from all_songs

        raises ValueError if the song is new and Spotify has no audio features for it
        """
        track = track_dict['track']
        uri = track['uri']

        #API timestamps look like 2020-05-26T23:23:47.026Z (UTC)
        played_utc = datetime.fromisoformat(track_dict['played_at'].replace("T", " ").replace("Z", ""))

        song_data = {
            'artist': track['artists'][0]['name'],
            'title': track['name'],
            'played_at': played_utc - self.UTC_OFFSET,
            'uri': uri,
        }
        
        #true if song is not yet in all_songs df
        new = uri not in self.all_songs.index
        
        #if new, retrieve audio features from API and predict label
        if new:
            response = self.sp.audio_features(uri)
            features = response[0] if response else None
            if features is None:
                raise ValueError(f"Spotify returned no audio features for {uri}")
            #build in the order of self.audio_features so predict() always
            #receives the features in the same, explicit order
            af = {name: features[name] for name in self.audio_features}
            label = predict(list(af.values()))
            
        #if not, then copy over audio features + label
        else:
            row = self.all_songs.loc[uri, :]
            af = dict(row[self.audio_features])
            label = row['label']

        #add audio features and label to song_data dictionary
        song_data.update(af)
        song_data['label'] = label

        return song_data, new

    def _save_session(self, timestamp, session):
        """
        pickles {'timestamp', 'session'} to last_session_path (the format
        __init__ reads back) and mirrors both values on the instance so a
        later scrape() on the same object continues from the new position
        """
        with open(self.last_session_path, 'wb') as f:
            pickle.dump({'timestamp': timestamp, 'session': session}, f)
        self.last_timestamp = timestamp
        self.last_session = session
        
    def scrape(self, n = 50, save = False):
        """
        fetches up to n plays newer than the last saved timestamp and returns
        them (oldest first) as a DataFrame with the columns of the recent df

        n       max number of plays requested from the API
        save    if True: back up the recent csv, append the new plays to it and
                store the new last timestamp / session id

        returns None if nothing new was played, or if only songs from untracked
        playlists were played (in that case the last timestamp is moved to
        "now" and saved regardless of `save`)
        """

        #filter out songs not played from specified playlists
        #(context is None for plays that did not come from a playlist/album/etc.)
        raw_recent = self.sp.current_user_recently_played(limit = n, after = self.last_timestamp)['items'][::-1]
        recent = [item for item in raw_recent if (item.get('context') or {}).get('uri') in self.playlist_uris]
            
        if len(raw_recent) != 0 and len(recent) == 0:
            most_recent = int(datetime.timestamp(datetime.now())*1000)
            print(f"Last session time: {datetime.fromtimestamp(most_recent/1000).strftime('%m-%d %H:%M %p')}")
            print(f"{len(raw_recent)} songs from untracked playlists played")
            #session id is unchanged, only the timestamp moves forward
            self._save_session(most_recent, self.last_session)
            print(f"Last session time saved to {self.last_session_path}")
            return
        
        #datetime format
        fmt = self.TIME_FORMAT
        n_songs = len(recent)
        
        #time of the last recorded play, shifted the same way as played_at so
        #that the two can be compared directly
        prev_time = self.EPOCH + timedelta(milliseconds = self.last_timestamp) - self.UTC_OFFSET
        
        #if n_songs == 0, then no untracked recent songs
        if n_songs == 0:
            print(f"No new songs played since {prev_time.strftime(fmt)}")
            return None

        #load previous session id in case new songs played < 20 min after last scrape
        prev_session = self.last_session
        
        #list of dicts of song features
        data = []
        
        new_songs = []
            
        for track in tqdm(recent):
            row, new = self._get_track_info(track)
            
            if new:
                #add to list of new songs
                new_songs.append(f"{row['artist']}: {row['title']}")
                
                #update all_songs right away so a repeat of the same song later
                #in this batch is recognised as known
                row_copy = {k:[v] for (k,v) in row.items() if k != 'played_at'}
                row_copy.update({'pc1':np.nan, 'pc2':np.nan})
                new_song_df = pd.DataFrame(row_copy, columns = ['uri'] + list(self.all_songs.columns)).set_index('uri')
                self.all_songs = pd.concat([self.all_songs, new_song_df])
                
            #need time as datetimeobj to check time between songs
            time = row['played_at']
            
            #convert time to specified format for readability in recent df
            str_time = time.strftime(fmt)
            row['played_at'] = str_time
              
            #check if elapsed time between this song and previous song > 20 min
            if abs(time - prev_time) > self.SESSION_GAP:
                #update session id
                prev_session = hashlib.sha1(str_time.encode('utf-8')).hexdigest()
                
            row['session'] = prev_session
            
            #update most recent time
            prev_time = time

            data.append(row)

        df = pd.DataFrame(data, columns = self.recent.columns)
        self.data = data

        print(f"Songs processed: {n_songs}")
        if len(new_songs) > 0:
            print(f"New songs added to all_songs DataFrame: {new_songs}")
        print(f"Last session time: {prev_time.strftime(fmt)}")
        
        #unix timestamp (ms) of last play time: undo the shift to get back to
        #UTC, then count milliseconds since the epoch
        self.most_recent_timestamp = (prev_time + self.UTC_OFFSET - self.EPOCH) // timedelta(milliseconds = 1)
        
        if save:
            self.recent.to_csv(self.recent_backup_path, index = False) #store old data in backup csv
            updated = pd.concat([self.recent, df], ignore_index = True)
            updated.to_csv(self.recent_path, index = False) #update old data in recent csv
            self._save_session(self.most_recent_timestamp, prev_session)
            #keep the in-memory copy in sync with what is now on disk
            self.recent = updated
            print(f"Session data saved to {self.recent_path}")
            print(f"Last session time saved to {self.last_session_path}")

        return df

In [3]:
# Build the scraper with its default paths (this authenticates against Spotify),
# then fetch the new plays and persist them (save = True writes the csv/pickle files).
rs = RecentScraper()
rs.scrape(save = True)

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


Songs processed: 16
Last session time: (05/26) 19:23 PM
Session data saved to data/recent.csv
Last session time saved to data/last_session.data


Unnamed: 0,artist,title,played_at,session,uri,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,label
0,Marc E. Bassy,Just My Luck,(05/26) 18:32 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:2QsBAfiNmngcrZsOTznqBG,0.617,0.64,-5.601,0.042,0.283,0.164,0.346,156.98,4
1,Boy In Space,7UP,(05/26) 18:36 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:4cB00WOFuQFLoDpnydcx8c,0.616,0.469,-7.38,0.027,0.0167,0.111,0.233,135.035,4
2,Mike Perry,Lighthouse,(05/26) 18:38 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:4AM3fxgUBkEUO6xScrmB9a,0.543,0.875,-3.033,0.032,0.0795,0.061,0.244,154.936,3
3,TWIN XL,Good,(05/26) 18:41 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:7sRdxkgNb3m3Ciai38ANmd,0.617,0.964,-3.559,0.0355,0.113,0.247,0.961,75.978,0
4,Post Malone,I'm Gonna Be,(05/26) 18:44 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:1ckLp8lCl8LipXI0ypX72m,0.746,0.559,-4.008,0.0316,0.703,0.148,0.368,117.943,2
5,joan,drive all night,(05/26) 18:47 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:75ZKw8JLaFsYr51J44fQ4N,0.542,0.758,-5.032,0.0313,0.000184,0.144,0.43,156.128,3
6,Said the Sky,Erase Me,(05/26) 18:51 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:0AISJtBDtZXgB2fuvcjQEN,0.528,0.61,-6.564,0.0287,0.376,0.211,0.304,130.11,6
7,Lost Frequencies,Are You With Me - Radio Edit,(05/26) 18:53 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:62nhuooamoroYpJyqE0nLd,0.763,0.61,-8.094,0.0355,0.275,0.12,0.392,121.031,4
8,Galantis,Runaway (U & I),(05/26) 18:57 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:46lFttIf5hnUZMGvjK0Wxo,0.506,0.805,-4.119,0.0469,0.00711,0.0856,0.383,126.008,3
9,Said the Sky,All I Got,(05/26) 19:02 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:1iaTDu4PCIWQQOwwwqq5qW,0.325,0.578,-6.786,0.0455,0.158,0.575,0.103,144.745,3


In [5]:
# Reload the saved listening history from disk and show its last 20 rows.
df = pd.read_csv("data/recent.csv")
df.tail(20)

Unnamed: 0,artist,title,played_at,session,uri,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,label
335,Khalid,OTW,(05/25) 20:12 PM,d2a04f014f6d12fcb72e387150666fbddfc3dae5,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.678,-6.183,0.0541,0.183,0.106,0.28,72.989,4
336,Boy In Space,7UP,(05/26) 08:32 AM,b8742d0bf900ad4eb0e5a08a849ee7396363e4b2,spotify:track:4cB00WOFuQFLoDpnydcx8c,0.616,0.469,-7.38,0.027,0.0167,0.111,0.233,135.035,4
337,Kid Travis,Strawberry Skies,(05/26) 12:36 PM,7193ad525a76f77a2b1444efd77b4bbea950d254,spotify:track:2p1onANhkUkyMZKKigOUJu,0.65,0.462,-6.589,0.0891,0.366,0.159,0.65,89.887,2
338,Bren Joy,Henny in the Hamptons,(05/26) 13:04 PM,d4d869240c95d67b65c6160411f0a744889d519f,spotify:track:19NKbRV1zevCIzePp88Wzh,0.631,0.634,-6.786,0.115,0.634,0.168,0.479,87.903,2
339,Marc E. Bassy,Just My Luck,(05/26) 18:32 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:2QsBAfiNmngcrZsOTznqBG,0.617,0.64,-5.601,0.042,0.283,0.164,0.346,156.98,4
340,Boy In Space,7UP,(05/26) 18:36 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:4cB00WOFuQFLoDpnydcx8c,0.616,0.469,-7.38,0.027,0.0167,0.111,0.233,135.035,4
341,Mike Perry,Lighthouse,(05/26) 18:38 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:4AM3fxgUBkEUO6xScrmB9a,0.543,0.875,-3.033,0.032,0.0795,0.061,0.244,154.936,3
342,TWIN XL,Good,(05/26) 18:41 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:7sRdxkgNb3m3Ciai38ANmd,0.617,0.964,-3.559,0.0355,0.113,0.247,0.961,75.978,0
343,Post Malone,I'm Gonna Be,(05/26) 18:44 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:1ckLp8lCl8LipXI0ypX72m,0.746,0.559,-4.008,0.0316,0.703,0.148,0.368,117.943,2
344,joan,drive all night,(05/26) 18:47 PM,4bd16a1c612ebfe2afb94ee23ac1cea35bb1d015,spotify:track:75ZKw8JLaFsYr51J44fQ4N,0.542,0.758,-5.032,0.0313,0.000184,0.144,0.43,156.128,3


Unnamed: 0,artist,title,played_at,session,uri,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,label
0,Kid Travis,Strawberry Skies,(05/26) 12:36 PM,7193ad525a76f77a2b1444efd77b4bbea950d254,spotify:track:2p1onANhkUkyMZKKigOUJu,0.65,0.462,-6.589,0.0891,0.366,0.159,0.65,89.887,2
1,Boy In Space,7UP,(05/26) 08:32 AM,b8742d0bf900ad4eb0e5a08a849ee7396363e4b2,spotify:track:4cB00WOFuQFLoDpnydcx8c,0.616,0.469,-7.38,0.027,0.0167,0.111,0.233,135.035,4
2,Khalid,OTW,(05/25) 20:12 PM,d2a04f014f6d12fcb72e387150666fbddfc3dae5,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.678,-6.183,0.0541,0.183,0.106,0.28,72.989,4
3,Post Malone,Rich & Sad,(05/25) 19:59 PM,d2a04f014f6d12fcb72e387150666fbddfc3dae5,spotify:track:2VdT56BGpdqNHUgOe1j5vc,0.599,0.571,-4.998,0.0665,0.198,0.0969,0.285,151.974,4
4,The Chainsmokers,Something Just Like This,(05/25) 19:54 PM,d2a04f014f6d12fcb72e387150666fbddfc3dae5,spotify:track:6RUKPb4LETWmmr3iAEQktW,0.617,0.635,-6.769,0.0317,0.0498,0.164,0.446,103.019,4


In [7]:
# Unix timestamp (milliseconds) of the last play processed by rs.scrape().
rs.most_recent_timestamp

1590496367026

In [6]:
# Load the backup csv, i.e. the listening history as it was before the last save.
dfb = pd.read_csv("data/recent_backup.csv")

In [7]:
# Show the first rows of the backup.
dfb.head()

Unnamed: 0,artist,title,played_at,session,uri,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,label
0,Khalid,OTW,(05/25) 20:12 PM,e2d00c9181a17d94bb774e0f19ad79a05cce6c0d,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.678,-6.183,0.0541,0.183,0.106,0.28,72.989,4
1,Post Malone,Rich & Sad,(05/25) 19:59 PM,e2d00c9181a17d94bb774e0f19ad79a05cce6c0d,spotify:track:2VdT56BGpdqNHUgOe1j5vc,0.599,0.571,-4.998,0.0665,0.198,0.0969,0.285,151.974,4
2,Khalid,OTW,(05/25) 20:12 PM,d2a04f014f6d12fcb72e387150666fbddfc3dae5,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.678,-6.183,0.0541,0.183,0.106,0.28,72.989,4
3,Post Malone,Rich & Sad,(05/25) 19:59 PM,d2a04f014f6d12fcb72e387150666fbddfc3dae5,spotify:track:2VdT56BGpdqNHUgOe1j5vc,0.599,0.571,-4.998,0.0665,0.198,0.0969,0.285,151.974,4
4,The Chainsmokers,Something Just Like This,(05/25) 19:54 PM,d2a04f014f6d12fcb72e387150666fbddfc3dae5,spotify:track:6RUKPb4LETWmmr3iAEQktW,0.617,0.635,-6.769,0.0317,0.0498,0.164,0.446,103.019,4


In [19]:
# Peek at the session id of the first row as pandas loaded it from the csv.
df.session[0]

4.82707e+18

In [16]:
# Remove plays that were recorded more than once and overwrite the recent csv.
# played_at is only minute-resolution text, so two different songs can share a
# value; de-duplicate on (played_at, uri) so that only true repeats are dropped.
df.drop_duplicates(['played_at', 'uri']).reset_index(drop = True).to_csv("data/recent.csv", index = False)

In [9]:
# FIXME: this statement is incomplete (unclosed brackets, empty .loc[] selection)
# and raises a SyntaxError; finish or delete this cell.
pd.concat([df.loc[]

Unnamed: 0,artist,title,played_at,session,uri,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,label
0,Khalid,OTW,(05/25) 20:12 PM,4827071877810892581,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.6779999999999999,-6.183,0.0541,0.183,0.106,0.28,72.98899999999998,4
1,Post Malone,Rich & Sad,(05/25) 19:59 PM,4827071877810892581,spotify:track:2VdT56BGpdqNHUgOe1j5vc,0.599,0.5710000000000001,-4.998,0.0665,0.198,0.0969,0.285,151.974,4
2,The Chainsmokers,Something Just Like This,(05/25) 19:54 PM,4827071877810892581,spotify:track:6RUKPb4LETWmmr3iAEQktW,0.617,0.635,-6.769,0.0317,0.0498,0.16399999999999998,0.446,103.019,4
3,Khalid,OTW,(05/25) 20:12 PM,4827071877810892581,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.6779999999999999,-6.183,0.0541,0.183,0.106,0.28,72.98899999999998,4
4,Post Malone,Rich & Sad,(05/25) 19:59 PM,4827071877810892581,spotify:track:2VdT56BGpdqNHUgOe1j5vc,0.599,0.5710000000000001,-4.998,0.0665,0.198,0.0969,0.285,151.974,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,Valley,Swim - Reprise,(05/08) 19:48 PM,-2878427576433552475,spotify:track:4eVE7ruYH5dNGYKT2jd5So,0.493,0.782,-6.968,0.0353,0.0187,0.321,0.314,104.941,4
342,Post Malone,I'm Gonna Be,(05/08) 19:44 PM,-2878427576433552475,spotify:track:1ckLp8lCl8LipXI0ypX72m,0.746,0.5589999999999999,-4.008,0.0316,0.703,0.14800000000000002,0.368,117.943,2
343,lovelytheband,coachella,(05/08) 19:41 PM,-2878427576433552475,spotify:track:1bPJ6Ay1La8IYPCuCvQA7k,0.563,0.6459999999999999,-6.849,0.0312,0.0664,0.0877,0.28800000000000003,115.013,4
344,88rising,La La Lost You,(05/08) 19:37 PM,-2878427576433552475,spotify:track:5Htq8DDDN0Xce9MwqzAzui,0.721,0.5820000000000001,-8.251,0.0411,0.315,0.065,0.487,100.083,2
