In [1]:
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
import spotipy.util as util
from tqdm.notebook import tqdm
from dateutil.parser import isoparse as dateparse
from datetime import datetime, timedelta
import pickle
from cluster_classifier import predict
import hashlib

In [2]:
class RecentScraper():
    def __init__(self, redirect_uri = "http://localhost:8000", token_path = "ref/api.txt", recent_path = "data/recent.csv", playlist_path = 'ref/playlists.data', timestamp_path = "data/timestamp.data", all_path = "data/all_songs_clustered.csv"):
        #load credentials
        with open(token_path, "r") as f:
            self.username, self.client_id, self.client_secret = [x.strip() for x in f]
        
        #load Spotify client
        scope = "user-read-recently-played playlist-modify-private user-library-read user-top-read user-library-modify user-modify-playback-state streaming"
        token = util.prompt_for_user_token(self.username,scope,self.client_id,self.client_secret,redirect_uri)
        self.sp = spotipy.Spotify(auth=token)
        
        #load playlist uris: only save songs that are from these playlists
        with open(playlist_path, 'rb') as f:
            self.playlist_uris = pickle.load(f)
        
        self.recent_path = recent_path
        self.recent = pd.read_csv(recent_path)
        
        self.timestamp_path = timestamp_path
        with open(timestamp_path, 'rb') as f:
            self.timestamp = int(pickle.load(f))
        
        self.all_path = all_path
        self.all_songs = pd.read_csv(all_path, index_col =  'uri')
        self.audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness','liveness', 'valence', 'tempo']
        
    def _get_track_info(self, track_dict):
        """
        given Spotify's JSON payload with song's audio features, returns dict
        with song features and whether or not song is new (already in all_songs)
            if new, then retrieves audio features and predicts label
            if not, then copies over audio features + label
        """
        track = track_dict['track']
        
        song_data = dict()

        song_data['artist'] = track['artists'][0]['name']
        song_data['title'] = track['name']
        song_data['played_at'] = datetime.fromisoformat(track_dict['played_at'].replace("T", " ").replace("Z", "")) - timedelta(hours = 4)
        uri = track['uri']
        song_data['uri'] = uri
        
        #true if song already in all_songs df
        new = uri not in self.all_songs.index
        
        #if new, retrieve audio features from API and predict label
        if new:
            af = {k:v for (k,v) in self.sp.audio_features(uri)[0].items() if k in self.audio_features}
            label = predict(list(af.values()))
            
        #if not, then copy over audio features + label
        else:
            row = self.all_songs.loc[uri, :]
            af = dict(row[self.audio_features])
            label = row['label']
        

        #add audio features and label to song_data dictionary
        song_data.update(af)
        song_data['label'] = label

        return song_data, new
        
    def scrape(self, n = 50, save = False):

        #filter out songs not played from specified playlists
        recent = self.sp.current_user_recently_played(limit = n, after = self.timestamp)['items']
        recent = list(filter(lambda x: x['context']['uri'] in self.playlist_uris, recent))        

        #datetime format
        fmt = "(%m/%d) %H:%M %p"
        n_songs = len(recent)
        
        last_session_time = datetime.utcfromtimestamp(self.timestamp/1000)
        prev_time = last_session_time
        
        #if n_songs == 0, then no untracked recent songs
        if n_songs != 0:
            #placeholder for session id, first song in recent should overwrite prev_session anyways
            prev_session = None
            
            #list of dicts of song features
            data = []      
            
            #songs played > 20  min apart are considered separate sessions
            period = timedelta(minutes = 20)
            
            new_songs = []
                
            for track in tqdm(recent):
                row, new = self._get_track_info(track)
                
                if new:
                    #add to list of new songs
                    new_songs.append(f"{row['artist']}: {row['title']}")
                    
                    #update all_songs
                    
                    row_copy = {k:[v] for (k,v) in row.items() if k != 'played_at'}
                    new_song_df = pd.DataFrame(row_copy).set_index('uri')
                    self.all_songs = self.all_songs.append(new_song_df)
                    
                #need time as datetimeobj to check time between songs
                time = row['played_at']
                
                #convert time to specified format for readability in recent df
                str_time = time.strftime(fmt)    
                row['played_at'] = str_time
                  
                #check if elapsed time between this song and previous song > 20 min
                if abs(time - prev_time) > period:
                    #update session id
                    prev_session = hashlib.sha1(str_time.encode('utf-8')).hexdigest()
                    
                row['session'] = prev_session
                
                #update most recent time
                prev_time = time

                data.append(row)

            df = pd.DataFrame(data, columns = ['artist', 'title', 'played_at', 'session', 'uri'] + self.audio_features + ['label'])
            self.data = data
            

            print(f"Songs processed: {n_songs}")
            if len(new_songs) > 0:
                print(f"New songs: {new_songs}")
            print(f"Last session time: {prev_time.strftime(fmt)}")
            
            #unix timestamp of last play time
            self.most_recent_timestamp = int(datetime.timestamp(prev_time)*1000)
            
            if save:
                df.append(self.recent).to_csv(self.recent_path, index = False)
                with open(self.timestamp_path, 'wb') as f:
                    pickle.dump(self.most_recent_timestamp, f)
                print(f"Session data saved to {self.recent_path}")
                print(f"Last session time saved to {self.timestamp_path}")

            return df
        
        else:
            print(f"No new songs played since {last_session_time.strftime(fmt)}")
            return None

In [6]:
rs = RecentScraper()
rs.scrape(save = True)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Songs processed: 3
Last session time: (05/25) 19:54 PM
Session data saved to data/recent.csv
Last session time saved to data/timestamp.data


Unnamed: 0,artist,title,played_at,session,uri,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,label
0,Khalid,OTW,(05/25) 20:12 PM,4827071877810892581,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.678,-6.183,0.0541,0.183,0.106,0.28,72.989,4
1,Post Malone,Rich & Sad,(05/25) 19:59 PM,4827071877810892581,spotify:track:2VdT56BGpdqNHUgOe1j5vc,0.599,0.571,-4.998,0.0665,0.198,0.0969,0.285,151.974,4
2,The Chainsmokers,Something Just Like This,(05/25) 19:54 PM,4827071877810892581,spotify:track:6RUKPb4LETWmmr3iAEQktW,0.617,0.635,-6.769,0.0317,0.0498,0.164,0.446,103.019,4


In [17]:
df = pd.read_csv("data/recent.csv")

In [19]:
df.session[0]

4.82707e+18

In [16]:
df.drop_duplicates(['played_at']).reset_index(drop = True).to_csv("data/recent.csv", index = False)

In [9]:
pd.concat([df.loc[]

Unnamed: 0,artist,title,played_at,session,uri,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,label
0,Khalid,OTW,(05/25) 20:12 PM,4827071877810892581,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.6779999999999999,-6.183,0.0541,0.183,0.106,0.28,72.98899999999998,4
1,Post Malone,Rich & Sad,(05/25) 19:59 PM,4827071877810892581,spotify:track:2VdT56BGpdqNHUgOe1j5vc,0.599,0.5710000000000001,-4.998,0.0665,0.198,0.0969,0.285,151.974,4
2,The Chainsmokers,Something Just Like This,(05/25) 19:54 PM,4827071877810892581,spotify:track:6RUKPb4LETWmmr3iAEQktW,0.617,0.635,-6.769,0.0317,0.0498,0.16399999999999998,0.446,103.019,4
3,Khalid,OTW,(05/25) 20:12 PM,4827071877810892581,spotify:track:6Hgh47WXVKtXN5zGOu0hjI,0.652,0.6779999999999999,-6.183,0.0541,0.183,0.106,0.28,72.98899999999998,4
4,Post Malone,Rich & Sad,(05/25) 19:59 PM,4827071877810892581,spotify:track:2VdT56BGpdqNHUgOe1j5vc,0.599,0.5710000000000001,-4.998,0.0665,0.198,0.0969,0.285,151.974,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,Valley,Swim - Reprise,(05/08) 19:48 PM,-2878427576433552475,spotify:track:4eVE7ruYH5dNGYKT2jd5So,0.493,0.782,-6.968,0.0353,0.0187,0.321,0.314,104.941,4
342,Post Malone,I'm Gonna Be,(05/08) 19:44 PM,-2878427576433552475,spotify:track:1ckLp8lCl8LipXI0ypX72m,0.746,0.5589999999999999,-4.008,0.0316,0.703,0.14800000000000002,0.368,117.943,2
343,lovelytheband,coachella,(05/08) 19:41 PM,-2878427576433552475,spotify:track:1bPJ6Ay1La8IYPCuCvQA7k,0.563,0.6459999999999999,-6.849,0.0312,0.0664,0.0877,0.28800000000000003,115.013,4
344,88rising,La La Lost You,(05/08) 19:37 PM,-2878427576433552475,spotify:track:5Htq8DDDN0Xce9MwqzAzui,0.721,0.5820000000000001,-8.251,0.0411,0.315,0.065,0.487,100.083,2
