## Import All Required Libraries

In [1]:
# Import standard-library helpers

import os

# Import all connection libraries

from spotipy import SpotifyClientCredentials
import spotipy
import sqlalchemy as sql
import psycopg2

# Import all data manipulation libraries

import pandas as pd
from pandarallel import pandarallel
import numpy as np
from tqdm import tqdm
pandarallel.initialize(verbose=0, nb_workers=8)

In [2]:
from functools import lru_cache

# Use the lru_cache decorator to cache the result of the function
# maxsize=1000 means the cache will store the result of up to 1000 items
@lru_cache(maxsize=1000)
def _get_playlist_tracks_cached(playlist_uri):
    """
    Fetch the first page of a playlist's items from the Spotify API and flatten it.
    The result is memoised per playlist_uri; the frame stored in the cache must never
    be handed out directly, because callers would then be able to modify it.
    :param playlist_uri: The Spotify URI of the playlist
    :return: A DataFrame with one row per entry of the response's 'items' list
    """
    # `sp` is the module-level Spotify client created in the connection cell
    return pd.json_normalize(sp.playlist_tracks(playlist_uri), record_path=['items'])


def get_playlist_tracks(playlist_uri):
    """
    This function uses the Spotify API to get the tracks of a given playlist.
    Results are cached for each unique playlist_uri, so if the same playlist_uri is
    passed to the function again no new API call is made.
    Every call returns a fresh copy of the cached DataFrame: callers are free to add
    or rename columns without corrupting what later calls receive.
    :param playlist_uri: The Spotify URI of the playlist
    :return: A DataFrame containing the tracks of the playlist
    """
    # DataFrame.copy() duplicates the frame's data, but Python objects stored inside
    # cells (lists, dicts) are still shared with the cached frame.
    return _get_playlist_tracks_cached(playlist_uri).copy()


# Keep the lru_cache management helpers reachable under the public name
get_playlist_tracks.cache_info = _get_playlist_tracks_cached.cache_info
get_playlist_tracks.cache_clear = _get_playlist_tracks_cached.cache_clear

## Load Initial Training Data

In [3]:
# establish connection to Spotify API

def _require_env(name):
    """Return the value of environment variable `name`; raise RuntimeError if it is unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} must be set before running this notebook")
    return value

# Credentials are read from the environment so they never live in the notebook.
# NOTE(review): the Spotify client secret and the Postgres password that used to be
# hard-coded in this cell should be treated as leaked and rotated.
cid = _require_env('SPOTIPY_CLIENT_ID')
secret = _require_env('SPOTIPY_CLIENT_SECRET')
client_credentials = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials, requests_timeout=15, retries=10)

# establish connection to Postgres

host_name = 'localhost'
database_name = 'Spotify'
# URL.create escapes special characters in the password, which string
# concatenation of the connection URL does not.
engine = sql.create_engine(
    sql.engine.URL.create(
        drivername='postgresql+psycopg2',
        username=os.environ.get('POSTGRES_USER', 'postgres'),
        password=_require_env('POSTGRES_PASSWORD'),
        host=host_name,
        database=database_name,
    )
)

# load training data
data = pd.read_csv('distinct_playlists.csv')[['playlist_uri', 'playlist_name']]
data['playlist_uri'] = data['playlist_uri'].str.strip()

# playlists whose tracks are already stored in the database
data2 = pd.read_sql('SELECT DISTINCT playlist_uri FROM playlist_tracks', engine)

# anti-join: keep only the playlists from the CSV that are not in the database yet
outer = data.merge(data2, on='playlist_uri', how='outer', indicator=True)
anti_join = outer[outer['_merge'] == 'left_only'].drop(columns='_merge')

BATCH_SIZE = 250  # number of not-yet-loaded playlists processed per notebook run
data = anti_join.head(BATCH_SIZE)
data.head()

Unnamed: 0,playlist_uri,playlist_name
5,spotify:playlist:009iSBUVKTJv0UFCfQow2t,Rap Workout 2022: Hip Hop Music For The Gym ??
6,spotify:playlist:00g1IGwpyKIxnWYtYr3VyT,EDM 2011
7,spotify:playlist:00GARohVljsECuUcNSfvSz,Evil EDM
8,spotify:playlist:00K8YK8wHcMqrUjQA4KOub,????MANYAO//EDM//TECHNO//NIGHTCORE??
9,spotify:playlist:00kEOANOLMZ5SvzfL5EYOc,Hip Hop Gospel


## Iterate Through Training Data to Change Grain of Data

In [7]:
PAGE_SIZE = 100  # rows in a full page of playlist items (the offsets below step by this)
MAX_PAGES = 3    # offsets 0, 100 and 200 -> at most 300 tracks are kept per playlist

# Flattened Spotify field names -> column names used in the playlist_tracks table
TRACK_COLUMN_NAMES = {
    "name": "playlist_name",
    "track.name": "track_name",
    "track.uri": "track_uri",
    "track.album.name": "album_name",
    "track.explicit": "isExplicit",
    "track.album.release_date": "release_date",
    "track.duration_ms": "duration_ms",
    "track.album.uri": "album_uri",
}

# List-valued columns that are removed before the frame is stored
UNUSED_COLUMNS = [
    'track.album.artists',
    'track.album.available_markets',
    'track.album.images',
    'track.artists',
    'track.available_markets',
    'images',
    'tracks.items',
]


def build_playlist_frame(playlist_uri):
    """
    Build one DataFrame with a row per track of the playlist, enriched with the
    playlist-level fields returned by sp.playlist().
    :param playlist_uri: The Spotify URI of the playlist
    :return: The track-grain DataFrame, or None when the playlist has no items
    :raises KeyError: if one of UNUSED_COLUMNS is missing from the merged frame
    """
    pages = [get_playlist_tracks(playlist_uri)]
    # Only ask for the next page while the previous one was full; a short page
    # means there is nothing left to fetch.
    for page_number in range(1, MAX_PAGES):
        if len(pages[-1]) < PAGE_SIZE:
            break
        response = sp.playlist_tracks(playlist_uri, offset=page_number * PAGE_SIZE)
        pages.append(pd.json_normalize(response, record_path=['items']))

    # concat always yields a new frame, so the cached first page is never modified
    tracks = pd.concat(pages)
    if tracks.empty:
        return None

    secondary_data = pd.json_normalize(sp.playlist(playlist_uri))
    tracks['playlist_uri'] = playlist_uri
    tracks = tracks.merge(secondary_data, left_on='playlist_uri', right_on='uri')
    tracks = tracks.rename(columns=TRACK_COLUMN_NAMES)
    return tracks.drop(columns=UNUSED_COLUMNS)


playlist_frames = []
failed_playlists = {}  # playlist_uri -> exception that caused the playlist to be skipped
for playlist in tqdm(data['playlist_uri']):
    try:
        frame = build_playlist_frame(playlist)
    except Exception as exc:
        # Best effort: one bad playlist (e.g. a 404) must not abort the batch, but the
        # reason is kept instead of being discarded. KeyboardInterrupt is not caught.
        failed_playlists[playlist] = exc
        continue
    if frame is not None:
        playlist_frames.append(frame)

if failed_playlists:
    print(f"Skipped {len(failed_playlists)} playlist(s); inspect failed_playlists for the reasons")

# pd.concat raises on an empty list, so fall back to an empty frame
test_list = pd.concat(playlist_frames) if playlist_frames else pd.DataFrame()
test_list.shape

  6%|▋         | 16/250 [00:16<03:38,  1.07it/s]HTTP Error for GET to https://api.spotify.com/v1/playlists/01clVI1RPrF8zHMxrqVIs0/tracks with Params: {'limit': 100, 'offset': 0, 'fields': None, 'market': None, 'additional_types': 'track'} returned 404 due to Not found.
 44%|████▎     | 109/250 [01:48<02:55,  1.25s/it]HTTP Error for GET to https://api.spotify.com/v1/playlists/07Kw4TRKVUaIzJdiA86Kvu/tracks with Params: {'limit': 100, 'offset': 0, 'fields': None, 'market': None, 'additional_types': 'track'} returned 404 due to Not found.
 46%|████▋     | 116/250 [01:53<01:58,  1.13it/s]

## Upload Training Data to playlist_tracks Table

In [None]:
#test_list.to_sql('playlist_tracks', engine, if_exists='append')

## Identify All Artists in Every Song

In [None]:
# Every distinct track already stored in playlist_tracks
df = pd.read_sql('''SELECT DISTINCT track_uri FROM playlist_tracks ORDER BY track_uri''', engine)['track_uri']

base_list = []
failed_artist_lookups = {}  # track_uri -> exception that caused the track to be skipped
for track in tqdm(df):
    try:
        # One row per entry of the track's 'artists' list
        track_artists = pd.json_normalize(sp.track(track), record_path=['artists'])
        track_artists['track_uri'] = track
        base_list.append(track_artists[['name', 'track_uri']])
    except Exception as exc:
        # Best effort: skip the track but remember why. KeyboardInterrupt is not caught.
        failed_artist_lookups[track] = exc

if failed_artist_lookups:
    print(f"Skipped {len(failed_artist_lookups)} track(s); inspect failed_artist_lookups for the reasons")

# Long format: one row per (track, artist). pd.concat raises on an empty list,
# so fall back to an empty frame with the expected columns.
df2 = pd.concat(base_list) if base_list else pd.DataFrame(columns=['name', 'track_uri'])
df2

100%|██████████| 644/644 [01:15<00:00,  8.53it/s]


Unnamed: 0,name,track_uri
0,Avril Lavigne,spotify:track:00Mb3DuaIH1kjrwOku9CGU
0,2 Chainz,spotify:track:00QyLmjxaSEE8qIZQjBXBj
1,Wiz Khalifa,spotify:track:00QyLmjxaSEE8qIZQjBXBj
0,Ayo & Teo,spotify:track:01stPT7J3W6Zx45jj1f4nk
0,Internet Money,spotify:track:02kDW379Yfd5PzW5A6vuGt
...,...,...
0,SLANDER,spotify:track:7zKOcKHZYFiPKlJ4NdPl00
1,Said The Sky,spotify:track:7zKOcKHZYFiPKlJ4NdPl00
2,JT Roach,spotify:track:7zKOcKHZYFiPKlJ4NdPl00
3,Brondo,spotify:track:7zKOcKHZYFiPKlJ4NdPl00


## Get Row Numbers for Pivoting

In [None]:
# Number the artists of each track 1..n by ascending artist name;
# method="first" breaks ties by the rows' existing order.
artist_names_by_track = df2.groupby("track_uri")["name"]
df2['RN'] = artist_names_by_track.rank(method="first", ascending=True)
df2

Unnamed: 0,name,track_uri,RN
0,Avril Lavigne,spotify:track:00Mb3DuaIH1kjrwOku9CGU,1.0
0,2 Chainz,spotify:track:00QyLmjxaSEE8qIZQjBXBj,1.0
1,Wiz Khalifa,spotify:track:00QyLmjxaSEE8qIZQjBXBj,2.0
0,Ayo & Teo,spotify:track:01stPT7J3W6Zx45jj1f4nk,1.0
0,Internet Money,spotify:track:02kDW379Yfd5PzW5A6vuGt,3.0
...,...,...,...
0,SLANDER,spotify:track:7zKOcKHZYFiPKlJ4NdPl00,3.0
1,Said The Sky,spotify:track:7zKOcKHZYFiPKlJ4NdPl00,4.0
2,JT Roach,spotify:track:7zKOcKHZYFiPKlJ4NdPl00,2.0
3,Brondo,spotify:track:7zKOcKHZYFiPKlJ4NdPl00,1.0


## Pivot DataFrame to Fit all Artists in DataFrame

In [None]:
# Reshape the long (track, artist) frame into one row per track with the columns
# artist1..artistN. Expects df2 to have the columns name, track_uri and RN, where
# RN is the 1-based number of the artist within its track.
MAX_ARTISTS = 4  # artists with a higher RN are dropped from the wide table
artist_columns = [f'artist{position}' for position in range(1, MAX_ARTISTS + 1)]

# Filter before pivoting: labelling every overflow row 'null' would give tracks with
# six or more artists duplicate (track_uri, 'null') pairs, which pivot rejects.
ranked = df2[df2['RN'] <= MAX_ARTISTS].copy()
ranked['artists'] = 'artist' + ranked['RN'].astype(int).astype(str)

# reindex guarantees all artist columns exist (filled with NaN) even when no track
# in this batch has that many artists.
# NOTE: df2 is rebound to the wide frame, so this cell cannot be re-run without
# re-running the cells that build the long frame first.
df2 = (
    ranked
    .pivot(index='track_uri', columns=['artists'], values='name')
    .reindex(columns=artist_columns)
)
df2

artists,artist1,artist2,artist3,artist4
track_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
spotify:track:00Mb3DuaIH1kjrwOku9CGU,Avril Lavigne,,,
spotify:track:00QyLmjxaSEE8qIZQjBXBj,2 Chainz,Wiz Khalifa,,
spotify:track:01stPT7J3W6Zx45jj1f4nk,Ayo & Teo,,,
spotify:track:02XnQdf7sipaKBBHixz3Zp,Lady Gaga,,,
spotify:track:02kDW379Yfd5PzW5A6vuGt,Don Toliver,Gunna,Internet Money,NAV
...,...,...,...,...
spotify:track:7yPPKfvIn97GDkffy1oV7w,Akylla,Kill The Noise,NGHTMRE,SNAILS
spotify:track:7ycWLEP1GsNjVvcjawXz3z,A$AP Rocky,Skepta,,
spotify:track:7ytR5pFWmSjzHJIeQkgog4,DaBaby,Roddy Ricch,,
spotify:track:7zKOcKHZYFiPKlJ4NdPl00,Brondo,JT Roach,SLANDER,Said The Sky


## Upload All Artist Data to Database

In [None]:
#df2.to_sql('artists', con=engine, if_exists='append')

## Get All Audio Features

In [None]:
# Columns kept from the audio-features response, in the order they are stored
FEATURE_COLUMNS = ['track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

# Every distinct track already stored in playlist_tracks
df = pd.read_sql('''SELECT DISTINCT track_uri FROM playlist_tracks ORDER BY track_uri''', engine)['track_uri']

base_list = []
failed_feature_lookups = {}  # track_uri -> reason the track was skipped
for track in tqdm(df):
    try:
        features = sp.audio_features(track)
        # Guard against an empty response or a None entry instead of relying on an
        # exception further down.
        if not features or features[0] is None:
            failed_feature_lookups[track] = 'no audio features returned'
            continue
        features_row = pd.json_normalize(features)
        features_row['track_uri'] = track
        base_list.append(features_row[FEATURE_COLUMNS])
    except Exception as exc:
        # Best effort: skip the track but remember why. KeyboardInterrupt is not caught.
        failed_feature_lookups[track] = exc

if failed_feature_lookups:
    print(f"Skipped {len(failed_feature_lookups)} track(s); inspect failed_feature_lookups for the reasons")

# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns
df3 = pd.concat(base_list) if base_list else pd.DataFrame(columns=FEATURE_COLUMNS)
df3

100%|██████████| 644/644 [00:59<00:00, 10.84it/s]


Unnamed: 0,track_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,spotify:track:00Mb3DuaIH1kjrwOku9CGU,0.487,0.900,0,-4.417,1,0.0482,0.000068,0.000000,0.3580,0.484,149.937,204000,4
0,spotify:track:00QyLmjxaSEE8qIZQjBXBj,0.554,0.899,8,-4.573,1,0.4080,0.052100,0.000000,0.0568,0.552,171.966,227893,4
0,spotify:track:01stPT7J3W6Zx45jj1f4nk,0.886,0.567,11,-6.960,1,0.1910,0.016500,0.000178,0.1030,0.283,140.043,220227,4
0,spotify:track:02kDW379Yfd5PzW5A6vuGt,0.799,0.660,1,-6.153,0,0.0790,0.256000,0.000000,0.1110,0.471,140.040,195429,4
0,spotify:track:02XnQdf7sipaKBBHixz3Zp,0.762,0.692,5,-3.973,0,0.0438,0.113000,0.000000,0.0940,0.397,114.906,208307,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,spotify:track:7yNK27ZTpHew0c55VvIJgm,0.599,0.587,5,-5.919,1,0.0423,0.274000,0.000000,0.1670,0.373,87.998,280787,4
0,spotify:track:7yPPKfvIn97GDkffy1oV7w,0.631,0.966,1,-3.101,0,0.3270,0.008500,0.225000,0.2070,0.202,155.022,193032,4
0,spotify:track:7ytR5pFWmSjzHJIeQkgog4,0.746,0.690,11,-7.956,1,0.1640,0.247000,0.000000,0.1010,0.497,89.977,181733,4
0,spotify:track:7zKOcKHZYFiPKlJ4NdPl00,0.548,0.684,8,-2.976,1,0.0340,0.028600,0.000911,0.3560,0.297,74.944,284800,4


## Upload Audio Features to Database

In [None]:
#df3.to_sql('audio_features', con=engine, if_exists='append')