## Import All Required Libraries

In [1]:
# Import all connection libraries

from spotipy import SpotifyClientCredentials
import spotipy
import sqlalchemy as sql
import psycopg2

# Import all data manipulation libraries

import pandas as pd
from pandarallel import pandarallel
import numpy as np
from tqdm import tqdm
pandarallel.initialize(verbose=0, nb_workers=8)

In [2]:
from functools import lru_cache

# Use the lru_cache decorator to cache the result of the function
# maxsize=1000 means the cache will store the result of up to 1000 items
@lru_cache(maxsize=1000)
def get_playlist_tracks(playlist_uri):
    """
    This function uses the Spotify API to get the tracks of a given playlist.
    The function is decorated with the lru_cache decorator to cache the result for each unique playlist_uri.
    So, if the same playlist_uri is passed to the function again, the cached result will be returned
    instead of making a new API call.
    :param playlist_uri: The Spotify URI of the playlist
    :return: A DataFrame containing the tracks of the playlist
    """
    # Use the Spotify API to get the tracks of the playlist
    tracks = pd.json_normalize(sp.playlist_tracks(playlist_uri), record_path=['items'])
    return tracks

## Load Initial Training Data

In [3]:
# establish connection to Spotify API

cid = '3fda75b7146a4769b207ee44017b3abe'
secret = 'aaaa11ed71604ff19d592070be0f6bbd'
client_credentials = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials, requests_timeout=15, retries=10)

# establish connection to Postgres

host_name = 'localhost'
database_name = 'Spotify'
engine = sql.create_engine('postgresql+psycopg2://postgres:DataNerd2023!!\
@localhost/Spotify')

# load training data
new_batch = pd.read_csv('distinct_playlists.csv')[['playlist_uri', 'playlist_name']]
new_batch['playlist_uri'] = new_batch['playlist_uri'].str.strip()

db_query = pd.read_sql('SELECT DISTINCT playlist_uri FROM playlist_tracks', engine)

outer = new_batch.merge(db_query, how='outer', indicator=True)
anti_join = outer[(outer._merge=='left_only')].drop('_merge', axis=1)

new_batch = pd.DataFrame(anti_join)
new_batch = new_batch[5:10]
new_batch.head()

Unnamed: 0,playlist_uri,playlist_name
687,spotify:playlist:0Pq0GkVjNxLtZHskNbi1mu,soft pop/sleep
775,spotify:playlist:0S0cuX8pnvmF7gA47Eu63M,New EDM This Week - New EDM 2022 - New Dance M...
776,spotify:playlist:0S4RgkTyzk7fnbcvbjJZou,Planet Fitness Workout
777,spotify:playlist:0s7bjJw4PovtWM5JzTItgn,Deep Tribal Jungle House
778,spotify:playlist:0s7X4BlXmMjhvfc17MaDUq,TOP RAP 2021 ?? Best Rap Hip-Hop Hits


In [4]:
sp.playlist('spotify:playlist:0ruchmtSqSR9Xjus3GIKHA')

{'collaborative': False,
 'description': 'the anger issues are over the top at this point',
 'external_urls': {'spotify': 'https://open.spotify.com/playlist/0ruchmtSqSR9Xjus3GIKHA'},
 'followers': {'href': None, 'total': 763},
 'href': 'https://api.spotify.com/v1/playlists/0ruchmtSqSR9Xjus3GIKHA?additional_types=track',
 'id': '0ruchmtSqSR9Xjus3GIKHA',
 'images': [{'height': None,
   'url': 'https://i.scdn.co/image/ab67706c0000bebbacce4a8fc3973d5af0ff8b1d',
   'width': None}],
 'name': "iwaizumi kinnies who can't show affection",
 'owner': {'display_name': '?miron?',
  'external_urls': {'spotify': 'https://open.spotify.com/user/6rpop5f33pq566kefs2ja24uy'},
  'href': 'https://api.spotify.com/v1/users/6rpop5f33pq566kefs2ja24uy',
  'id': '6rpop5f33pq566kefs2ja24uy',
  'type': 'user',
  'uri': 'spotify:user:6rpop5f33pq566kefs2ja24uy'},
 'primary_color': None,
 'public': True,
 'snapshot_id': 'NTMsZWZkNzcxNWI0MzVhMjM4N2I3YmMyMjFkNjlkODljNTRhMTliYTU2ZA==',
 'tracks': {'href': 'https://api.sp

## Iterate Through Training Data to Change Grain of Data

In [5]:
load_batch = []
series = new_batch['playlist_uri'].to_dict()
for playlist in tqdm(series.values()):
    try:
        tracks = get_playlist_tracks(playlist)
        if len(tracks) >= 100:
                tracks2 = pd.json_normalize(sp.playlist_tracks(playlist, offset=100), record_path=['items'])
                tracks3 = pd.json_normalize(sp.playlist_tracks(playlist, offset=200), record_path=['items'])
                tracks = pd.concat([tracks, tracks2, tracks3])
                secondary_data = pd.json_normalize(sp.playlist(playlist))
                tracks['playlist_uri'] = playlist
                tracks = tracks.merge(secondary_data, left_on='playlist_uri', right_on='uri')
                tracks = tracks.rename(columns={"name":"playlist_name", "track.name":"track_name", "track.uri":"track_uri", "track.album.name":"album_name", "track.explicit":"isExplicit", "track.album.release_date":"release_date", "track.duration_ms":"duration_ms", "track.album.uri":"album_uri", "added_by.external_urls.spotify": "added_by_external_urls_spotify", 'added_by.href':"added_by_href", "added_by.id":"added_by_id", "added_by.type":"added_by_type", "added_by.uri":"added_by_uri", "track.album.album_type":"track_album_album_type", "track.album.external_urls.spotify":"track_album_external_urls_spotify", "track.album.href":"track_album_href", "track.album.id":"track_album_id", "track.album.release_date_precision":"track_album_release_date_precision", "track.album.total_tracks":"track_album_total_tracks","track.album.type":"track_album_type", "track.disc_number":"track_disc_number", "track.episode":"track_episode", "track.external_ids.isrc":"track_external_ids_isrc", "track.external_urls.spotify":"track_external_ids_spotify", "track.href":"track_href", "track.id":"track_id", "track.is_local":"track_is_local", "track.popularity":"track_popularity", "track.preview_url":"track_preview_url", "track.track":"track_track", "track.track_number":"track_track_number", "track.type":"track_type", "video_thumbnail.url":"video_thumbnail_url", "external_urls.spotify":"external_urls_spotify", "followers.href":"followers_href", "followers.total":"followers_total", "owner.display_name":"owner_display_name", "owner.external_urls.spotify":"owner_external_urls_spotify", "owner.href":"owner_href", "owner.id":"owner_id", "owner.type":"owner_type", "owner.uri":"owner_uri", "tracks.href":"tracks_href", "tracks.limit":"tracks_limit", "tracks.next":"tracks_next", "tracks.offset":"tracks_offset", "tracks.previous":"tracks_previous", "tracks.total":"tracks_total"})
                tracks = tracks.drop(columns=['track.album.artists', 'track.album.available_markets', 'track.album.images', 'track.artists', 'track.available_markets', 'images', 'tracks.items'])

                load_batch.append(tracks)
        else:
                secondary_data = pd.json_normalize(sp.playlist(playlist))
                tracks['playlist_uri'] = playlist
                tracks = tracks.merge(secondary_data, left_on='playlist_uri', right_on='uri')
                tracks = tracks.rename(columns={"name":"playlist_name", "track.name":"track_name", "track.uri":"track_uri", "track.album.name":"album_name", "track.explicit":"isExplicit", "track.album.release_date":"release_date", "track.duration_ms":"duration_ms", "track.album.uri":"album_uri", "added_by.external_urls.spotify": "added_by_external_urls_spotify", 'added_by.href':"added_by_href", "added_by.id":"added_by_id", "added_by.type":"added_by_type", "added_by.uri":"added_by_uri", "track.album.album_type":"track_album_album_type", "track.album.external_urls.spotify":"track_album_external_urls_spotify", "track.album.href":"track_album_href", "track.album.id":"track_album_id", "track.album.release_date_precision":"track_album_release_date_precision", "track.album.total_tracks":"track_album_total_tracks","track.album.type":"track_album_type", "track.disc_number":"track_disc_number", "track.episode":"track_episode", "track.external_ids.isrc":"track_external_ids_isrc", "track.external_urls.spotify":"track_external_ids_spotify", "track.href":"track_href", "track.id":"track_id", "track.is_local":"track_is_local", "track.popularity":"track_popularity", "track.preview_url":"track_preview_url", "track.track":"track_track", "track.track_number":"track_track_number", "track.type":"track_type", "video_thumbnail.url":"video_thumbnail_url", "external_urls.spotify":"external_urls_spotify", "followers.href":"followers_href", "followers.total":"followers_total", "owner.display_name":"owner_display_name", "owner.external_urls.spotify":"owner_external_urls_spotify", "owner.href":"owner_href", "owner.id":"owner_id", "owner.type":"owner_type", "owner.uri":"owner_uri", "tracks.href":"tracks_href", "tracks.limit":"tracks_limit", "tracks.next":"tracks_next", "tracks.offset":"tracks_offset", "tracks.previous":"tracks_previous", "tracks.total":"tracks_total"})
                tracks = tracks.drop(columns=['track.album.artists', 'track.album.available_markets', 'track.album.images', 'track.artists', 'track.available_markets', 'images', 'tracks.items'])
                load_batch.append(tracks)
    except:
            pass
load_batch = pd.concat(load_batch)
load_batch.head()

  0%|          | 0/5 [00:00<?, ?it/s]HTTP Error for GET to https://api.spotify.com/v1/playlists/0Pq0GkVjNxLtZHskNbi1mu/tracks with Params: {'limit': 100, 'offset': 0, 'fields': None, 'market': None, 'additional_types': 'track'} returned 404 due to Not found.
100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Unnamed: 0,added_at,is_local,primary_color_x,added_by_external_urls_spotify,added_by_href,added_by_id,added_by_type,added_by_uri,track_album_album_type,track_album_external_urls_spotify,...,owner_href,owner_id,owner_type,owner_uri,tracks_href,tracks_limit,tracks_next,tracks_offset,tracks_previous,tracks_total
0,2023-02-09T14:35:54Z,False,,https://open.spotify.com/user/freeql727jautay0...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,album,https://open.spotify.com/album/6JlmbLrsozWe61r...,...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,https://api.spotify.com/v1/playlists/0S0cuX8pn...,100,,0,,96
1,2023-02-09T14:35:54Z,False,,https://open.spotify.com/user/freeql727jautay0...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,single,https://open.spotify.com/album/2yMcWYGtKE0e47u...,...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,https://api.spotify.com/v1/playlists/0S0cuX8pn...,100,,0,,96
2,2023-02-09T14:35:54Z,False,,https://open.spotify.com/user/freeql727jautay0...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,single,https://open.spotify.com/album/1FxJT4dFth8T9Z1...,...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,https://api.spotify.com/v1/playlists/0S0cuX8pn...,100,,0,,96
3,2023-02-09T14:35:54Z,False,,https://open.spotify.com/user/freeql727jautay0...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,single,https://open.spotify.com/album/33utleg13XIBr1B...,...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,https://api.spotify.com/v1/playlists/0S0cuX8pn...,100,,0,,96
4,2023-02-09T14:35:54Z,False,,https://open.spotify.com/user/freeql727jautay0...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,single,https://open.spotify.com/album/7iu8zkhlxrwVoWA...,...,https://api.spotify.com/v1/users/freeql727jaut...,freeql727jautay0upi97yesm,user,spotify:user:freeql727jautay0upi97yesm,https://api.spotify.com/v1/playlists/0S0cuX8pn...,100,,0,,96


## Upload Training Data to playlist_tracks Table

In [6]:
load_batch.to_sql('playlist_tracks', engine, if_exists='append')

762

## Identify All Artists in Every Song

In [7]:
df = pd.read_sql('''SELECT DISTINCT track_uri FROM playlist_tracks''', engine)
outer = df.merge(load_batch, how='outer', indicator=True)
anti_join = outer[(outer._merge=='left_only')].drop('_merge', axis=1)

df = pd.Series(anti_join['track_uri'].head(10000))
base_list = []
for track in tqdm(df):

    try:
        print("It's me")
        df2 = pd.json_normalize(sp.track(track), record_path=['artists'])
        print("It works?")
        df2['track_uri'] = track
        df2 = df2[['name', 'track_uri']]
        base_list.append(df2)
    except:
        print('Hello')
df2 = pd.concat(base_list)
df2

  0%|          | 0/10000 [00:00<?, ?it/s]

It's me
Hello
It's me


## Get Row Numbers for Pivoting

In [None]:
df2['RN'] = df2.groupby("track_uri")["name"].rank(method="first", ascending=True)
df2

Unnamed: 0,name,track_uri,RN
0,Dorrough Music,spotify:track:3lwMGZ9M5Gq4usaZZLV4NK,1.0
0,Arman Cekin,spotify:track:364vyXytDCU4KVgjrDmVQX,1.0
1,Esther Sparkes,spotify:track:364vyXytDCU4KVgjrDmVQX,2.0
0,LoCraft,spotify:track:2ZTvUeTWseCBVhPUfJsgGt,1.0
0,chromonicci,spotify:track:0zARNJ9ss6SpaDUzxB95j0,1.0
...,...,...,...
0,Flavour,spotify:track:7u8pI0Q3NLg2RWvC1lyaGm,1.0
0,$uicideboy$,spotify:track:3cQPyVlnrmmWU5fQJxl0q9,1.0
0,First Aid Kit,spotify:track:3fy2ILYi8upEuracr84cx0,1.0
0,Jason Aldean,spotify:track:5eH1ktZDHgegpkYJOY7tme,1.0


## Pivot DataFrame to Fit all Artists in DataFrame

In [None]:
df2 = df2.pivot(index='track_uri', columns=['RN'], values='name').reset_index()
df2 = df2.rename(columns={1.0:'artist1', 2.0:'artist2', 3.0:'artist3', 4.0:'artist4', 5.0:'artist5'})
df2 = df2[['track_uri', 'artist1', 'artist2', 'artist3', 'artist4', 'artist5']]
df2

RN,track_uri,artist1,artist2,artist3,artist4,artist5
0,spotify:track:000VUaYhw19g0d6tCGZkJB,6LACK,Dreamville,JID,Lute,
1,spotify:track:005lwxGU1tms6HGELIcUv9,Katy Perry,,,,
2,spotify:track:00AxNl4D4jHL2AEf1W55j5,Dierks Bentley,,,,
3,spotify:track:00EFWaqXnHZ5smJNsHtnGV,HALIENE,Seven Lions,,,
4,spotify:track:00I0pcNkN3IOX3fsYbaB4N,Takeoff,,,,
...,...,...,...,...,...,...
9990,spotify:track:7zhyWEAvCMi6W440bAkCTO,Colyn,,,,
9991,spotify:track:7zjzu90Q4mtNNaE9Ol9Zbv,AC/DC,,,,
9992,spotify:track:7zpBRUYHP0RWncVUX9mvJO,Drowning Pool,,,,
9993,spotify:track:7zubSmuSBhEmxJVcOkxvP9,Nonso Amadi,melvitto,,,


## Upload All Artist Data to Database

In [None]:
df2.to_sql('artists', con=engine, if_exists='append')

995

## Get All Audio Features

In [None]:
df = pd.read_sql('''SELECT DISTINCT track_uri FROM playlist_tracks''', engine)
outer = df.merge(load_batch, how='outer', indicator=True)
anti_join = outer[(outer._merge=='left_only')].drop('_merge', axis=1)
df = pd.Series(anti_join['track_uri'].head(10000))

base_list = []
for track in tqdm(df):
    try:
        df3 = pd.json_normalize(sp.audio_features(track))
        df3['track_uri'] = track
        df3 = df3[['track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]
        base_list.append(df3)
    except:
        pass
df3 = pd.concat(base_list)
df3

 10%|▉         | 950/10000 [01:28<13:29, 11.18it/s]  Expected id of type track but found type Arcus spotify:local:RL+GRIME+%26+GRAVES:Arcus:Arcus:201
 21%|██        | 2103/10000 [03:21<11:07, 11.84it/s]  Expected id of type track but found type R%C3%AAves+de+Star spotify:local:Corneille:Parce+Qu%27on+Vient+de+Loin+%5BBonus+Tracks%5D+Disc+1:R%C3%AAves+de+Star:241
 45%|████▌     | 4540/10000 [07:12<07:49, 11.62it/s]Expected id of type track but found type Platform+Moon spotify:local:Jupiter+One:Jupiter+One:Platform+Moon:345
 71%|███████   | 7093/10000 [11:14<04:31, 10.69it/s]Expected id of type track but found type Baby+Doll spotify:local:Virgo:Virgo:Baby+Doll:211
100%|██████████| 10000/10000 [15:55<00:00, 10.47it/s]


Unnamed: 0,track_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,spotify:track:3lwMGZ9M5Gq4usaZZLV4NK,0.849,0.5900,0,-8.367,1,0.4280,0.000545,0.000000,0.3430,0.656,180.026,237320,4
0,spotify:track:364vyXytDCU4KVgjrDmVQX,0.562,0.6900,2,-5.998,0,0.0389,0.036800,0.000066,0.1080,0.227,150.056,209608,4
0,spotify:track:2ZTvUeTWseCBVhPUfJsgGt,0.608,0.0696,0,-19.207,1,0.0457,0.812000,0.943000,0.1080,0.088,159.945,183000,4
0,spotify:track:0zARNJ9ss6SpaDUzxB95j0,0.514,0.7670,9,-9.479,1,0.2050,0.375000,0.882000,0.3030,0.718,81.146,205468,4
0,spotify:track:6IanFWu96WUD1pfTirhZBe,0.845,0.8040,10,-5.009,1,0.0609,0.041700,0.859000,0.0749,0.652,125.015,262019,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,spotify:track:7u8pI0Q3NLg2RWvC1lyaGm,0.842,0.9170,11,-3.897,0,0.1980,0.115000,0.000002,0.6590,0.539,102.971,215876,4
0,spotify:track:3cQPyVlnrmmWU5fQJxl0q9,0.678,0.7700,6,-3.206,0,0.1450,0.125000,0.000000,0.1840,0.284,75.009,133903,4
0,spotify:track:3fy2ILYi8upEuracr84cx0,0.876,0.6270,8,-6.367,1,0.0565,0.035500,0.000000,0.0958,0.638,116.075,208893,4
0,spotify:track:5eH1ktZDHgegpkYJOY7tme,0.430,0.8410,6,-2.990,1,0.0370,0.130000,0.000000,0.0517,0.489,180.977,183440,4


## Upload Audio Features to Database

In [None]:
df3.to_sql('audio_features', con=engine, if_exists='append')

991

In [None]:
all_data = pd.read_sql('''SELECT DISTINCT playlist_uri, playlist_name, owner_uri, owner_display_name, collaborative, description, followers_total
track_uri, track_name, artist1, artist2, artist3, artist4, artist5, album_uri, album_name, release_date, 
added_by_uri, added_at, track_album_total_tracks, af.duration_ms, "isExplicit", track_popularity, 
track_preview_url, video_thumbnail_url, danceability, energy, loudness, key, mode, acousticness, speechiness, 
instrumentalness, liveness, valence, tempo, time_signature
FROM playlist_tracks pt
JOIN artists a ON pt.track_uri = a.track_uri
JOIN audio_features af ON pt.track_uri = af.track_uri;''', engine)
f = open("s3.csv", "w")
f.truncate()
f.close()

all_data.to_csv('s3.csv', mode="w+")