## Import All Required Libraries

In [1]:
# Import standard-library modules

import os

# Import all connection libraries

import psycopg2
import requests
import spotipy
import sqlalchemy as sql
from spotipy import SpotifyClientCredentials

# Import all data manipulation libraries

import pandas as pd
from pandarallel import pandarallel
import numpy as np
from tqdm import tqdm
pandarallel.initialize(verbose=0, nb_workers=8)

In [2]:
from functools import lru_cache

# Use the lru_cache decorator to cache the result of the function
# maxsize=1000 means the cache will store the result of up to 1000 items
@lru_cache(maxsize=1000)
def _fetch_playlist_tracks(playlist_uri):
    """
    Call the Spotify API once for the given playlist and flatten the response's
    'items' records into a DataFrame.
    The result is memoised per playlist_uri (up to 1000 entries), so the returned
    frame is shared by every later cache hit and must not be handed out directly.
    :param playlist_uri: The Spotify URI of the playlist
    :return: The cached DataFrame built from the 'items' records
    """
    return pd.json_normalize(sp.playlist_tracks(playlist_uri), record_path=['items'])


def get_playlist_tracks(playlist_uri):
    """
    This function uses the Spotify API to get the tracks of a given playlist.
    API responses are cached per unique playlist_uri, so passing the same
    playlist_uri again does not trigger a new API call.
    A single `sp.playlist_tracks` call is made with no offset; fetching further
    pages is left to the caller.
    :param playlist_uri: The Spotify URI of the playlist
    :return: A DataFrame containing the tracks of the playlist. Each call returns
             a fresh copy, so callers may add or modify columns freely.
    """
    # Return a copy: lru_cache would otherwise give every caller the *same*
    # DataFrame object, and an in-place edit (e.g. adding a column) would leak
    # into all later cache hits.
    # NOTE(review): nested Python objects stored inside cells (lists/dicts from
    # the JSON) are presumably still shared between copies - confirm if callers
    # mutate those.
    return _fetch_playlist_tracks(playlist_uri).copy()


# Keep the cache-management helpers available under the public name, as they
# were when the decorator sat directly on get_playlist_tracks.
get_playlist_tracks.cache_info = _fetch_playlist_tracks.cache_info
get_playlist_tracks.cache_clear = _fetch_playlist_tracks.cache_clear

## Establish External Connections

In [3]:
# establish connection to Spotify API

# Credentials are read from the environment so that no secret is stored in the
# notebook. A missing variable raises KeyError here, before any request is made.
# NOTE(review): the client secret and database password that used to be
# hardcoded in this cell should be treated as leaked and rotated.
cid = os.environ['SPOTIPY_CLIENT_ID']
secret = os.environ['SPOTIPY_CLIENT_SECRET']
client_credentials = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials, requests_timeout=15, retries=10)

# establish connection to Postgres

# Host, database and user keep their previous values as defaults; only the
# password has no default and must be supplied via POSTGRES_PASSWORD.
host_name = os.environ.get('POSTGRES_HOST', 'localhost')
database_name = os.environ.get('POSTGRES_DB', 'Spotify')

# URL.create assembles the connection URL from its parts and escapes special
# characters in the password, instead of splicing them into a string by hand.
connection_url = sql.engine.URL.create(
    'postgresql+psycopg2',
    username=os.environ.get('POSTGRES_USER', 'postgres'),
    password=os.environ['POSTGRES_PASSWORD'],
    host=host_name,
    database=database_name,
)
engine = sql.create_engine(connection_url)

## Load Initial Training Data

In [4]:

# load training data
# new_batch = pd.read_csv('distinct_playlists.csv')[['playlist_uri', 'playlist_name']]
# new_batch['playlist_uri'] = new_batch['playlist_uri'].str.strip()

# db_query = pd.read_sql('SELECT DISTINCT playlist_uri FROM playlist_tracks', engine)

# outer = new_batch.merge(db_query, how='outer', indicator=True)
# anti_join = outer[(outer._merge=='left_only')].drop('_merge', axis=1)

# new_batch = pd.DataFrame(anti_join)
# new_batch = new_batch[9:29]
# new_batch.head()

## Iterate Through Training Data to Change Grain of Data

In [5]:
# load_batch = []
# series = new_batch['playlist_uri'].to_dict()
# for playlist in tqdm(series.values()):
#     try:
#         tracks = get_playlist_tracks(playlist)
#         if len(tracks) >= 100:
#                 tracks2 = pd.json_normalize(sp.playlist_tracks(playlist, offset=100), record_path=['items'])
#                 tracks3 = pd.json_normalize(sp.playlist_tracks(playlist, offset=200), record_path=['items'])
#                 tracks = pd.concat([tracks, tracks2, tracks3])
#                 secondary_data = pd.json_normalize(sp.playlist(playlist))
#                 tracks['playlist_uri'] = playlist
#                 tracks = tracks.merge(secondary_data, left_on='playlist_uri', right_on='uri')
#                 tracks = tracks.rename(columns={"name":"playlist_name", "track.name":"track_name", "track.uri":"track_uri", "track.album.name":"album_name", "track.explicit":"isExplicit", "track.album.release_date":"release_date", "track.duration_ms":"duration_ms", "track.album.uri":"album_uri", "added_by.external_urls.spotify": "added_by_external_urls_spotify", 'added_by.href':"added_by_href", "added_by.id":"added_by_id", "added_by.type":"added_by_type", "added_by.uri":"added_by_uri", "track.album.album_type":"track_album_album_type", "track.album.external_urls.spotify":"track_album_external_urls_spotify", "track.album.href":"track_album_href", "track.album.id":"track_album_id", "track.album.release_date_precision":"track_album_release_date_precision", "track.album.total_tracks":"track_album_total_tracks","track.album.type":"track_album_type", "track.disc_number":"track_disc_number", "track.episode":"track_episode", "track.external_ids.isrc":"track_external_ids_isrc", "track.external_urls.spotify":"track_external_ids_spotify", "track.href":"track_href", "track.id":"track_id", "track.is_local":"track_is_local", "track.popularity":"track_popularity", "track.preview_url":"track_preview_url", "track.track":"track_track", "track.track_number":"track_track_number", "track.type":"track_type", "video_thumbnail.url":"video_thumbnail_url", "external_urls.spotify":"external_urls_spotify", "followers.href":"followers_href", "followers.total":"followers_total", "owner.display_name":"owner_display_name", "owner.external_urls.spotify":"owner_external_urls_spotify", "owner.href":"owner_href", "owner.id":"owner_id", "owner.type":"owner_type", "owner.uri":"owner_uri", "tracks.href":"tracks_href", "tracks.limit":"tracks_limit", "tracks.next":"tracks_next", "tracks.offset":"tracks_offset", "tracks.previous":"tracks_previous", "tracks.total":"tracks_total"})
#                 tracks = tracks.drop(columns=['track.album.artists', 'track.album.available_markets', 'track.album.images', 'track.artists', 'track.available_markets', 'images', 'tracks.items'])

#                 load_batch.append(tracks)
#         else:
#                 secondary_data = pd.json_normalize(sp.playlist(playlist))
#                 tracks['playlist_uri'] = playlist
#                 tracks = tracks.merge(secondary_data, left_on='playlist_uri', right_on='uri')
#                 tracks = tracks.rename(columns={"name":"playlist_name", "track.name":"track_name", "track.uri":"track_uri", "track.album.name":"album_name", "track.explicit":"isExplicit", "track.album.release_date":"release_date", "track.duration_ms":"duration_ms", "track.album.uri":"album_uri", "added_by.external_urls.spotify": "added_by_external_urls_spotify", 'added_by.href':"added_by_href", "added_by.id":"added_by_id", "added_by.type":"added_by_type", "added_by.uri":"added_by_uri", "track.album.album_type":"track_album_album_type", "track.album.external_urls.spotify":"track_album_external_urls_spotify", "track.album.href":"track_album_href", "track.album.id":"track_album_id", "track.album.release_date_precision":"track_album_release_date_precision", "track.album.total_tracks":"track_album_total_tracks","track.album.type":"track_album_type", "track.disc_number":"track_disc_number", "track.episode":"track_episode", "track.external_ids.isrc":"track_external_ids_isrc", "track.external_urls.spotify":"track_external_ids_spotify", "track.href":"track_href", "track.id":"track_id", "track.is_local":"track_is_local", "track.popularity":"track_popularity", "track.preview_url":"track_preview_url", "track.track":"track_track", "track.track_number":"track_track_number", "track.type":"track_type", "video_thumbnail.url":"video_thumbnail_url", "external_urls.spotify":"external_urls_spotify", "followers.href":"followers_href", "followers.total":"followers_total", "owner.display_name":"owner_display_name", "owner.external_urls.spotify":"owner_external_urls_spotify", "owner.href":"owner_href", "owner.id":"owner_id", "owner.type":"owner_type", "owner.uri":"owner_uri", "tracks.href":"tracks_href", "tracks.limit":"tracks_limit", "tracks.next":"tracks_next", "tracks.offset":"tracks_offset", "tracks.previous":"tracks_previous", "tracks.total":"tracks_total"})
#                 tracks = tracks.drop(columns=['track.album.artists', 'track.album.available_markets', 'track.album.images', 'track.artists', 'track.available_markets', 'images', 'tracks.items'])
#                 load_batch.append(tracks)
#     except:
#             pass
# load_batch = pd.concat(load_batch)
# load_batch.head()

## Upload Training Data to playlist_tracks Table

In [6]:
#load_batch.to_sql('playlist_tracks', engine, if_exists='append')

## Identify All Artists in Every Song

In [7]:
# Find tracks that already have audio features but no row in `artists` yet:
# outer-merge the two URI lists and keep the rows present only on the left
# (audio_features) side.
df = pd.read_sql('''SELECT DISTINCT track_uri FROM artists''', engine)
df2 = pd.read_sql('''SELECT DISTINCT track_uri FROM audio_features''', engine)
outer = df2.merge(df, how='outer', indicator=True)
anti_join = outer[(outer._merge=='left_only')].drop('_merge', axis=1)

# Upper bound on how many tracks are looked up per run of this cell.
ARTIST_BATCH_SIZE = 2000

df = pd.Series(anti_join['track_uri'].head(ARTIST_BATCH_SIZE))
base_list = []
failed_tracks = []
for track in tqdm(df):
    try:
        # One row per artist credited on the track.
        track_artists = pd.json_normalize(sp.track(track), record_path=['artists'])
        track_artists['track_uri'] = track
        base_list.append(track_artists[['name', 'track_uri']])
    except Exception as exc:
        # Best effort: a failing track is skipped, but recorded so the failure
        # is visible. Catching Exception (not a bare except) lets
        # KeyboardInterrupt stop a long-running loop.
        failed_tracks.append((track, repr(exc)))

if failed_tracks:
    print(f'Skipped {len(failed_tracks)} of {len(df)} tracks; first failure: {failed_tracks[0]}')

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when nothing was fetched.
if base_list:
    df2 = pd.concat(base_list)
else:
    df2 = pd.DataFrame(columns=['name', 'track_uri'])
df2

100%|██████████| 2000/2000 [04:27<00:00,  7.48it/s]


Unnamed: 0,name,track_uri
0,Nicki Minaj,spotify:track:0WtVtCK5SEFvyyA5V8D9CI
1,Drake,spotify:track:0WtVtCK5SEFvyyA5V8D9CI
2,Lil Wayne,spotify:track:0WtVtCK5SEFvyyA5V8D9CI
0,A$AP Ferg,spotify:track:7gbs274CmEorDmQuOYzaud
1,Meek Mill,spotify:track:7gbs274CmEorDmQuOYzaud
...,...,...
1,Davido,spotify:track:6XDHjqVzv0JzS9fnQEVJ1W
0,Shlohmo,spotify:track:629Cjw0fUyZUMkBjnjttDR
0,Rema,spotify:track:3BnDvpeuGOj21Ir2aVEtQo
0,Joshua Micah,spotify:track:7ozXoMBdjxtS3U2tbzo9Ay


## Get Row Numbers for Pivoting

In [8]:
# Number the artists of each track 1.0, 2.0, ... so they can later become
# pivot columns. The numbering follows ascending artist name within a track;
# method="first" breaks ties by order of appearance.
artist_rank_within_track = (
    df2
    .groupby("track_uri")["name"]
    .rank(method="first", ascending=True)
)
df2['RN'] = artist_rank_within_track
df2

Unnamed: 0,name,track_uri,RN
0,Nicki Minaj,spotify:track:0WtVtCK5SEFvyyA5V8D9CI,3.0
1,Drake,spotify:track:0WtVtCK5SEFvyyA5V8D9CI,1.0
2,Lil Wayne,spotify:track:0WtVtCK5SEFvyyA5V8D9CI,2.0
0,A$AP Ferg,spotify:track:7gbs274CmEorDmQuOYzaud,1.0
1,Meek Mill,spotify:track:7gbs274CmEorDmQuOYzaud,2.0
...,...,...,...
1,Davido,spotify:track:6XDHjqVzv0JzS9fnQEVJ1W,1.0
0,Shlohmo,spotify:track:629Cjw0fUyZUMkBjnjttDR,1.0
0,Rema,spotify:track:3BnDvpeuGOj21Ir2aVEtQo,1.0
0,Joshua Micah,spotify:track:7ozXoMBdjxtS3U2tbzo9Ay,1.0


## Pivot DataFrame to One Row per Track with One Column per Artist

In [9]:
# Map the float rank labels (1.0 .. 5.0) to the artist1 .. artist5 column names.
artist_columns = {float(position): f'artist{position}' for position in range(1, 6)}

# One row per track, one column per artist rank.
df2 = df2.pivot(index='track_uri', columns=['RN'], values='name').reset_index()
df2 = df2.rename(columns=artist_columns)

# reindex instead of a plain column selection: when no track in the batch has
# five artists, the higher rank columns do not exist and a selection would
# raise KeyError. reindex creates them filled with NaN. Ranks above 5 are
# dropped, as before.
df2 = df2.reindex(columns=['track_uri', *artist_columns.values()])
df2

RN,track_uri,artist1,artist2,artist3,artist4,artist5
0,spotify:track:00KHrXKOfWvloluo9OFgjl,Tom. G,XXXTENTACION,,,
1,spotify:track:00iQ29Q9sF7SuRassUh5F8,Clay Walker,,,,
2,spotify:track:01mNrMDaDseSKzrTmRfqKa,Glup!,,,,
3,spotify:track:01wuIn1TP5FCvYyqBTdsGC,Jess Glynne,Tinie Tempah,,,
4,spotify:track:022X40ctU5PIHD8YUjS9qZ,Safri Duo,,,,
...,...,...,...,...,...,...
1995,spotify:track:7ygpwy2qP3NbrxVkHvUhXY,Oasis,,,,
1996,spotify:track:7ytLGF9Ua5g5MylbwtxzZC,Gorilla Zoe,,,,
1997,spotify:track:7yyRTcZmCiyzzJlNzGC9Ol,DRAM,Lil Yachty,,,
1998,spotify:track:7z1xQUZKgpXXdCquuSRjRC,Rascal Flatts,,,,


## Upload All Artist Data to Database

In [10]:
# Append the pivoted artist rows to the `artists` table and display the value
# returned by to_sql.
rows_written = df2.to_sql(name='artists', con=engine, if_exists='append')
rows_written

1000

## Get All Audio Features

In [11]:
# df = pd.read_sql('''SELECT DISTINCT track_uri FROM audio_features''', engine)
# outer = load_batch.merge(df, how='outer', indicator=True)
# anti_join = outer[(outer._merge=='left_only')].drop('_merge', axis=1)
# df = pd.Series(anti_join['track_uri'].head(5000))

# base_list = []
# for track in tqdm(df):
#     try:
#         df3 = pd.json_normalize(sp.audio_features(track))
#         df3['track_uri'] = track
#         df3 = df3[['track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]
#         base_list.append(df3)
#     except:
#         pass
# df3 = pd.concat(base_list)

## Upload Audio Features to Database

In [12]:
#df3.to_sql('audio_features', con=engine, if_exists='append')

In [13]:
# Join every playlist/track row with its artists and audio features into a
# single denormalised frame.
all_data = pd.read_sql('''SELECT DISTINCT playlist_uri, playlist_name, owner_uri, owner_display_name, collaborative, description, followers_total,
af.track_uri, track_name, artist1, artist2, artist3, artist4, artist5, album_uri, album_name, release_date, 
added_by_uri, added_at, track_album_total_tracks, af.duration_ms, "isExplicit", track_popularity, 
track_preview_url, video_thumbnail_url, danceability, energy, loudness, key, mode, acousticness, speechiness, 
instrumentalness, liveness, valence, tempo, time_signature
FROM playlist_tracks pt
JOIN artists a ON pt.track_uri = a.track_uri
JOIN audio_features af ON pt.track_uri = af.track_uri;''', engine)

# Write the export. Opening in "w" mode already replaces any existing s3.csv,
# so no separate open/truncate/close pass is needed, and no file handle is
# left for this cell to manage.
all_data.to_csv('s3.csv', mode="w")