In [1]:
# Import all connection libraries

from spotipy import SpotifyClientCredentials
import spotipy
import sqlalchemy as sql

# Import all data manipulation libraries

import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm

# Import Libraries to Refresh Access Token
import time
import os

# Initialise pandarallel with 8 workers; verbose=0 presumably silences its log output — confirm against the pandarallel docs.
# NOTE(review): none of the cells visible here call a pandarallel method (e.g. parallel_apply); confirm this is still needed.
pandarallel.initialize(verbose=0, nb_workers=8)

In [2]:
# Reset spotipy's on-disk token cache when the timestamp file says the last reset was
# at least an hour ago, then record the time of this reset.
TOKEN_TIMESTAMP_FILE = "timeCacheLastAccessed.txt"
TOKEN_CACHE_FILE = ".cache"
TOKEN_MAX_AGE_SECONDS = 3600


def _require_env(name):
    """Return the value of environment variable `name`, or raise a clear error if it is unset/empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Set the {name} environment variable before running this notebook.")
    return value


try:
    with open(TOKEN_TIMESTAMP_FILE, "r", encoding="UTF-8") as f:
        timeLastAccessed = float(f.readline())
except (FileNotFoundError, ValueError):
    # First run, or an empty/corrupt timestamp file: treat the cached token as stale.
    timeLastAccessed = 0.0

if time.time() - timeLastAccessed >= TOKEN_MAX_AGE_SECONDS:  # If an hour has passed.
    try:
        os.remove(TOKEN_CACHE_FILE)
    except FileNotFoundError:
        pass  # No token has been cached yet, so there is nothing to delete.
    with open(TOKEN_TIMESTAMP_FILE, "w", encoding="UTF-8") as f:
        f.write(str(time.time()))

# establish connection to Spotify API
# Credentials are read from the environment so that no secret is stored in the notebook.
cid = _require_env("SPOTIPY_CLIENT_ID")
secret = _require_env("SPOTIPY_CLIENT_SECRET")
client_credentials = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials, requests_timeout=15, retries=10)

# Full SQLAlchemy URL, e.g. postgresql+psycopg2://<user>:<password>@localhost/spotify
engine = sql.create_engine(_require_env("SPOTIFY_DATABASE_URL"))

# Retrieves a new access token if necessary.
# NOTE: as the last expression of the cell this displays the raw token; clear the output before sharing.
client_credentials.get_access_token(as_dict=False)

'BQBDQDYI-7ueeIIkrEOy9rToXTG7Xpvgia_YuWNyPSax-6C0a139MsU0KgJvHTyFHBe3ef-Aw5OkkQd2_AZ1vQmMe2LBXL9mP0CXICUQVibol3Od4YXO'

In [3]:
from functools import lru_cache


# Cache the raw API result per playlist URI; maxsize=1000 keeps up to 1000 playlists.
@lru_cache(maxsize=1000)
def _fetch_playlist_tracks(playlist_uri):
    """Call the Spotify API once per playlist_uri and flatten its 'items' into a DataFrame."""
    return pd.json_normalize(sp.playlist_tracks(playlist_uri), record_path=['items'])


def get_playlist_tracks(playlist_uri):
    """
    Return the tracks of a playlist as a DataFrame, using a per-URI cache.

    The API response is cached for each unique playlist_uri, so repeated calls with the
    same URI do not trigger a new API call. Every call returns a fresh copy of the cached
    frame: callers add columns to the result, and handing out the cached object itself
    would let those edits leak into later calls.

    Only the items returned by a single `sp.playlist_tracks` call are included
    (presumably the first page of the playlist — confirm against the spotipy docs).

    :param playlist_uri: The Spotify URI of the playlist
    :return: A DataFrame containing the tracks of the playlist
    """
    return _fetch_playlist_tracks(playlist_uri).copy()


# Keep the cache-management helpers that the lru_cache-decorated function used to expose.
get_playlist_tracks.cache_info = _fetch_playlist_tracks.cache_info
get_playlist_tracks.cache_clear = _fetch_playlist_tracks.cache_clear

In [4]:
# Load the playlist CSV and keep only the playlists whose URI is not yet in the database.

with pd.option_context("mode.dtype_backend", "pyarrow"):
    new_batch = pd.read_csv(
        'C:\\Users\\Chase\\OneDrive\\Documents\\temp\\UVU-2022-2023\\distinct_playlists_new.csv',
        engine='pyarrow',
    )[['playlist_uri', 'playlist_name']]
    new_batch['playlist_uri'] = new_batch['playlist_uri'].str.strip()

    # URIs that already have rows in the playlist_tracks table.
    db_query = pd.read_sql('SELECT DISTINCT playlist_uri FROM playlist_tracks', engine)

    # Anti-join: the outer merge tags each row with its origin; 'left_only' rows exist only in the CSV.
    outer = new_batch.merge(db_query, how='outer', indicator=True)
    only_in_csv = outer['_merge'] == 'left_only'
    anti_join = outer.loc[only_in_csv].drop(columns='_merge')

    new_batch = pd.DataFrame(anti_join)
new_batch.head(10)

Unnamed: 0,playlist_uri,playlist_name
0,spotify:playlist:5IMpQdB7rXdmMTuzhlhPZ4,69 HYPE
1,spotify:playlist:2e8iff2Rxt2BP0VpgJpVfj,Touhou OST (6-18)
2,spotify:playlist:7GTg7UFb2U2lsSZTfEwDQz,Valerie - '68 Version
3,spotify:playlist:7tvMtjQOPd08XgOBK6K3aE,MORNING SEX
4,spotify:playlist:5T8y1iYFXgo0FNkYax3XkT,AK-69
5,spotify:playlist:5T8y1iYFXgo0FNkYax3XkT,AK-69
6,spotify:playlist:4ubFrmZ99vpCucevPNWRPQ,"Happy 60s, 70s, 80s"
7,spotify:playlist:7g61XIrplOs8Kiyn7CsIbN,sex playlist 😫
8,spotify:playlist:5saOtgQGmyweUq8aKmvGVR,Norske 60-talls hits
9,spotify:playlist:3vlpRGlGVLnLHR1ZlnqSeH,60-luvun suomi-iskelmiä


In [5]:
# A first page of this many items means the playlist may have more tracks to fetch.
_SPOTIFY_PAGE_SIZE = 100
# Offsets of the follow-up pages requested for long playlists (at most 300 tracks per playlist).
_EXTRA_PAGE_OFFSETS = (100, 200)

_PLAYLIST_CSV_PATH = 'C:\\Users\\Chase\\OneDrive\\Documents\\UVU-2022-2023\\distinct_playlists_new.csv'

# Flattened API column name -> column name written to the playlist_tracks table.
_PLAYLIST_TRACK_RENAMES = {
    "name": "playlist_name",
    "track.name": "track_name",
    "track.uri": "track_uri",
    "track.album.name": "album_name",
    "track.explicit": "isExplicit",
    "track.album.release_date": "release_date",
    "track.duration_ms": "duration_ms",
    "track.album.uri": "album_uri",
    "added_by.external_urls.spotify": "added_by_external_urls_spotify",
    "added_by.href": "added_by_href",
    "added_by.id": "added_by_id",
    "added_by.type": "added_by_type",
    "added_by.uri": "added_by_uri",
    "track.album.album_type": "track_album_album_type",
    "track.album.external_urls.spotify": "track_album_external_urls_spotify",
    "track.album.href": "track_album_href",
    "track.album.id": "track_album_id",
    "track.album.release_date_precision": "track_album_release_date_precision",
    "track.album.total_tracks": "track_album_total_tracks",
    "track.album.type": "track_album_type",
    "track.disc_number": "track_disc_number",
    "track.episode": "track_episode",
    "track.external_ids.isrc": "track_external_ids_isrc",
    "track.external_urls.spotify": "track_external_ids_spotify",
    "track.href": "track_href",
    "track.id": "track_id",
    "track.is_local": "track_is_local",
    "track.popularity": "track_popularity",
    "track.preview_url": "track_preview_url",
    "track.track": "track_track",
    "track.track_number": "track_track_number",
    "track.type": "track_type",
    "video_thumbnail.url": "video_thumbnail_url",
    "external_urls.spotify": "external_urls_spotify",
    "followers.href": "followers_href",
    "followers.total": "followers_total",
    "owner.display_name": "owner_display_name",
    "owner.external_urls.spotify": "owner_external_urls_spotify",
    "owner.href": "owner_href",
    "owner.id": "owner_id",
    "owner.type": "owner_type",
    "owner.uri": "owner_uri",
    "tracks.href": "tracks_href",
    "tracks.limit": "tracks_limit",
    "tracks.next": "tracks_next",
    "tracks.offset": "tracks_offset",
    "tracks.previous": "tracks_previous",
    "tracks.total": "tracks_total",
}

# Columns removed from every playlist frame before it is stored.
_NESTED_COLUMNS = ['track.album.artists', 'track.album.available_markets', 'track.album.images', 'track.artists', 'track.available_markets', 'images', 'tracks.items']

# Columns that only show up for some playlists; dropped whenever present.
_OPTIONAL_COLUMNS = ['track.album.is_playable', 'track.album.restrictions.reason', 'track.external_ids.spotify', 'track.is_playable', 'track.album.album_group']


def _load_unprocessed_playlists(csv_path):
    """Return the CSV playlists whose playlist_uri has no rows yet in the playlist_tracks table."""
    new_batch = pd.read_csv(csv_path, engine='pyarrow')[['playlist_uri', 'playlist_name']]
    new_batch['playlist_uri'] = new_batch['playlist_uri'].str.strip()

    db_query = pd.read_sql('SELECT DISTINCT playlist_uri FROM playlist_tracks', engine)

    # Anti-join: rows tagged 'left_only' by the outer merge exist only in the CSV.
    outer = new_batch.merge(db_query, how='outer', indicator=True)
    return outer[outer['_merge'] == 'left_only'].drop(columns='_merge')


def _fetch_playlist_frame(playlist):
    """Fetch up to three pages of a playlist's tracks, join the playlist metadata and rename the columns."""
    tracks = get_playlist_tracks(playlist)
    if len(tracks) >= _SPOTIFY_PAGE_SIZE:
        pages = [tracks]
        for offset in _EXTRA_PAGE_OFFSETS:
            page = pd.json_normalize(sp.playlist_tracks(playlist, offset=offset), record_path=['items'])
            pages.append(page)
            if len(page) < _SPOTIFY_PAGE_SIZE:
                break  # A short page is the last one; requesting the next offset would return nothing.
        tracks = pd.concat(pages)

    secondary_data = pd.json_normalize(sp.playlist(playlist))
    # assign() builds a new frame, so the frame cached by get_playlist_tracks is never modified.
    tracks = tracks.assign(playlist_uri=playlist)
    tracks = tracks.merge(secondary_data, left_on='playlist_uri', right_on='uri')
    tracks = tracks.rename(columns=_PLAYLIST_TRACK_RENAMES)
    return tracks.drop(columns=_NESTED_COLUMNS)


def load_all_data(batch_size=50, batch_number=None, csv_path=_PLAYLIST_CSV_PATH):
    """
    Load the next batch of not-yet-stored playlists from Spotify into the playlist_tracks table.

    Playlists listed in the CSV but absent from playlist_tracks are selected, de-duplicated by
    playlist_uri and truncated to `batch_size`. Each playlist is fetched on a best-effort basis:
    a playlist that raises is reported and skipped, and the rest of the batch is still appended.
    Columns listed in _OPTIONAL_COLUMNS are dropped whenever they are present.

    NOTE: a playlist that fails is still absent from the table, so the next call selects it again.

    :param batch_size: Maximum number of playlists processed per call.
    :param batch_number: Label used in the progress message. When omitted, the notebook-level
        counter `i` is used if it exists (the batch loop sets it), otherwise '?'.
    :param csv_path: Path of the CSV file that lists the candidate playlists.
    :return: None
    """
    with pd.option_context("mode.dtype_backend", "pyarrow"):
        # load training data
        new_batch = _load_unprocessed_playlists(csv_path).drop_duplicates(subset='playlist_uri')
        new_batch = new_batch[0:batch_size]

        # extract all tracks in playlists
        load_batch = []
        for playlist in tqdm(new_batch['playlist_uri'].tolist()):
            try:
                load_batch.append(_fetch_playlist_frame(playlist))
            except Exception as exc:
                # Best effort: report the failure and continue with the remaining playlists.
                tqdm.write(f'Skipping {playlist}: {type(exc).__name__}: {exc}')

        label = batch_number if batch_number is not None else globals().get('i', '?')
        if not load_batch:
            # pd.concat raises on an empty list, and there is nothing to write anyway.
            print(f'playlist_tracks not updated #{label}: no playlist could be loaded')
            return

        load_batch = pd.concat(load_batch)
        load_batch = load_batch.drop(columns=_OPTIONAL_COLUMNS, errors='ignore')
        load_batch.to_sql('playlist_tracks', engine, if_exists='append')
        print(f'playlist_tracks updated #{label}')
        
        # extract all artists
        # query = """SELECT DISTINCT playlist_tracks.track_uri, playlist_tracks.track_name, artists.artist1, artists.artist2 FROM artists RIGHT JOIN playlist_tracks ON artists.track_uri = playlist_tracks.track_uri WHERE artist1 IS NULL AND LEFT(playlist_tracks.track_uri, 13) = 'spotify:track' ORDER BY playlist_tracks.track_uri, artist1 LIMIT 500"""
        # df = pd.read_sql(query, engine)

        # df = pd.Series(df['track_uri'])
        # base_list = []
        # for track in tqdm(df):
        #         try:
        #                 df2 = pd.json_normalize(sp.track(track), record_path=['artists'])
        #                 df2['track_uri'] = track
        #                 df2 = df2[['name', 'track_uri']]
        #                 base_list.append(df2)
        #         except:
        #                 pass
        # df2 = pd.concat(base_list)
        
        # df2['RN'] = df2.groupby("track_uri")["name"].rank(method="first", ascending=True)
        # df2 = df2.pivot(index='track_uri', columns=['RN'], values='name').reset_index()
        # df2 = df2.rename(columns={1.0:'artist1', 2.0:'artist2', 3.0:'artist3', 4.0:'artist4', 5.0:'artist5'})
        # try:
        #         df2 = df2[['track_uri', 'artist1', 'artist2', 'artist3', 'artist4', 'artist5']]
        # except(KeyError):
        #         df2 = df2[['track_uri', 'artist1', 'artist2']]
        
        # df2.to_sql('artists', engine, if_exists='append')
        # print(f'artists updated #{i}')
        
        # # extract all audio_features
        # query = """SELECT DISTINCT playlist_tracks.track_uri, playlist_tracks.track_name, audio_features.danceability FROM audio_features RIGHT JOIN playlist_tracks ON audio_features.track_uri = playlist_tracks.track_uri WHERE danceability IS NULL AND LEFT(playlist_tracks.track_uri,13) = 'spotify:track' ORDER BY playlist_tracks.track_uri, danceability LIMIT 500"""
        # df = pd.read_sql(query, engine)

        # df = pd.Series(df['track_uri'])

        # base_list = []
        # for track in tqdm(df):
        #         try:
        #                 df3 = pd.json_normalize(sp.audio_features(track))
        #                 df3['track_uri'] = track
        #                 df3 = df3[['track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]
        #                 base_list.append(df3)
        #         except:
        #                 pass
        # df3 = pd.concat(base_list)
        
        # df3.to_sql('audio_features', con=engine, if_exists='append')
        # print(f'audio_features updated #{i}')

In [6]:
# Run the loader for eight consecutive batches.
# The counter has to be the notebook-level name `i`: load_all_data() reads it for its progress message.
N_BATCHES = 8

i = 1
while i <= N_BATCHES:
    load_all_data()
    i += 1

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# First slice of the joined playlist/artist/audio-feature rows (the first 1894425 rows in
# playlist_uri, track_uri order), exported to oltp_output1.csv.
OLTP_OUTPUT1_PATH = "C:\\Users\\Chase\\OneDrive\\Documents\\Career Development\\Data Science Club\\Spring 2023 Club Project\\oltp_output1.csv"

all_data1 = pd.read_sql('''SELECT DISTINCT playlist_uri, playlist_name, owner_uri, owner_display_name, collaborative, description, followers_total,
af.track_uri, track_name, artist1, artist2, artist3, artist4, artist5, album_uri, album_name, release_date, 
added_by_uri, added_at, track_album_total_tracks, af.duration_ms, "isExplicit", track_popularity, 
track_preview_url, video_thumbnail_url, danceability, energy, loudness, key, mode, acousticness, speechiness, 
instrumentalness, liveness, valence, tempo, time_signature
FROM playlist_tracks pt
JOIN artists a ON pt.track_uri = a.track_uri
JOIN audio_features af ON pt.track_uri = af.track_uri
ORDER BY playlist_uri, track_uri
LIMIT 1894425;''', engine)

# to_csv opens the file in a write mode, which already replaces any previous export,
# so no separate open/truncate/close step is needed.
all_data1.to_csv(OLTP_OUTPUT1_PATH, mode="w+", index=False)

In [None]:
# Second slice of the joined playlist/artist/audio-feature rows (everything after the first
# 1894425 rows in playlist_uri, track_uri order), exported to oltp_output2.csv.
OLTP_OUTPUT2_PATH = "C:\\Users\\Chase\\OneDrive\\Documents\\Career Development\\Data Science Club\\Spring 2023 Club Project\\oltp_output2.csv"

all_data2 = pd.read_sql('''SELECT DISTINCT playlist_uri, playlist_name, owner_uri, owner_display_name, collaborative, description, followers_total,
af.track_uri, track_name, artist1, artist2, artist3, artist4, artist5, album_uri, album_name, release_date, 
added_by_uri, added_at, track_album_total_tracks, af.duration_ms, "isExplicit", track_popularity, 
track_preview_url, video_thumbnail_url, danceability, energy, loudness, key, mode, acousticness, speechiness, 
instrumentalness, liveness, valence, tempo, time_signature
FROM playlist_tracks pt
JOIN artists a ON pt.track_uri = a.track_uri
JOIN audio_features af ON pt.track_uri = af.track_uri
ORDER BY playlist_uri, track_uri
OFFSET 1894425;''', engine)

# to_csv opens the file in a write mode, which already replaces any previous export,
# so no separate open/truncate/close step is needed.
all_data2.to_csv(OLTP_OUTPUT2_PATH, mode="w+", index=False)

In [None]:
# Columns kept for the per-track feature table.
feature_columns = [
    'track_uri', 'track_name',
    'artist1', 'artist2', 'artist3', 'artist4', 'artist5',
    'album_name', 'release_date',
    'danceability', 'energy', 'track_popularity', 'acousticness', 'valence', 'tempo',
]

features1 = all_data1[feature_columns]
features2 = all_data2[feature_columns]

# Stack both exports, then keep a single row per track_uri.
features = pd.concat([features1, features2]).drop_duplicates(subset='track_uri')
features.to_parquet(
    'C:\\Users\\Chase\\OneDrive\\Documents\\UVU-2022-2023\\features.parquet.gzip',
    compression='gzip',
)