# Analyzing Playlists from Common Features

In [311]:
import requests
import pandas as pd
import numpy as np
import json
import constants

## Extraction of playlist data from Spotify's API

In [312]:
OAuth = constants.OAuth
headers = {"Accept" : "application/json",
                        "Content-Type" : "application/json",
                        "Authorization": OAuth}

In [313]:
def get_playlists(playlist_type, country_code, limit):
    
    '''
    Returns a dictionary of containing the given amount of playlists of the provided type from the 
    country matching the country code provided.
    '''
    
    playlists = (requests.get(url=f"https://api.spotify.com/v1/browse/categories/{playlist_type}/playlists?country={country_code}&limit={limit}",
                              headers = headers)).json()
    
    return playlists

In [314]:
def extract_playlist_ids(playlists):
    
    ''' Extract the ids of the playlists to be able to query the tracks contained in the playlist using Spotify API. '''
    
    ids = []
    playlist_info = playlists['playlists']['items']
    for i in range(len(playlist_info)):
        ids.append(playlist_info[i]['id'])
    
    return ids

In [315]:
def extract_track_info(playlist_ids):
    
    ''' Extract all of the relevant information about the tracks in a playlist, given a list a playlist ids. '''
    
    track_info = []

    for playlist in playlist_ids:
        tracks = (requests.get(url=f"https://api.spotify.com/v1/playlists/{playlist}/tracks",
                               headers = headers)).json()            
  
        fields = {"id", "name", "artists", "album", "duration_ms", "popularity"}
        for i in range(len(tracks['items'])):
            if tracks['items'][i]['track']:
                info = { key:value for key,value in tracks['items'][i]['track'].items() if key in fields}
                track_info.append(info)
            else:
                print('BARK BARK BARK >:(')
    
    return track_info

In [316]:
def get_tracks(playlist_type, country_code, limit):
    
    '''
    Returns a list containing the information for various tracks.
    playlist_type: The type of playlists where you want to query songs
    country_code: The country code of the country where you want to find the playlists
    limit: The amount of playlists you want to extract tracks from
    '''
    
    playlists = get_playlists(playlist_type, country_code, limit)
    playlist_ids = extract_playlist_ids(playlists)
    track_info = extract_track_info(playlist_ids)
    
    return track_info

In [317]:
def get_track_features(track_ids, df):
    '''
    Given an array of track ids extract the audio features of the track.
    '''
    track_features = []
    lower_bound = 0
    upper_bound = 60
    while lower_bound < len(track_ids):
        track_string = ','.join(df['id'][lower_bound:upper_bound])
        features = requests.get(url=f"https://api.spotify.com/v1/audio-features",
                                params={'ids':track_string},
                                headers=headers).json()
        lower_bound += 60
        upper_bound += 60
        if features['audio_features']:
            track_features.extend(features['audio_features'])
    return track_features

In [318]:
workout_tracks = pd.DataFrame(get_tracks('workout', 'US', 8))
workout_tracks = workout_tracks[['id', 'name', 'artists', 'album','duration_ms', 'popularity']]

display(workout_tracks.head())
print(workout_tracks.shape)

Unnamed: 0,id,name,artists,album,duration_ms,popularity
0,2KH16WveTQWT6KOG9Rg6e2,Eye of the Tiger,[{'external_urls': {'spotify': 'https://open.s...,"{'album_type': 'album', 'artists': [{'external...",245640,75
1,2zYzyRzz6pRmhPzyfMEC8s,Highway to Hell,[{'external_urls': {'spotify': 'https://open.s...,"{'album_type': 'album', 'artists': [{'external...",208400,83
2,0pqnGHJpmpxLKifKRmU6WP,Believer,[{'external_urls': {'spotify': 'https://open.s...,"{'album_type': 'album', 'artists': [{'external...",204346,87
3,5BIMPccDwShpXq784RJlJp,Enter Sandman,[{'external_urls': {'spotify': 'https://open.s...,"{'album_type': 'album', 'artists': [{'external...",331573,73
4,57BrRMwf9LrcmuOsyGilwr,Crawling,[{'external_urls': {'spotify': 'https://open.s...,"{'album_type': 'album', 'artists': [{'external...",208960,73


(592, 6)


In [270]:
def extract_artist_name(artists_info):
    artist_names = []
    for i in range(len(artists_info)):
        artist_names.append(artists_info[i][0]['name'])
    
    return artist_names

In [271]:
def extract_album_name(album_info):
    album_names = []
    for i in range(len(album_info)):
        album_names.append(album_info[i]['name'])
    
    return album_names

In [272]:
workout_tracks['artists'] = extract_artist_name(workout_tracks['artists'])
workout_tracks['album'] = extract_album_name(workout_tracks['album'])
workout_tracks.head()

Unnamed: 0,id,name,artists,album,duration_ms,popularity
0,2KH16WveTQWT6KOG9Rg6e2,Eye of the Tiger,Survivor,Rocky IV,245640,75
1,2zYzyRzz6pRmhPzyfMEC8s,Highway to Hell,AC/DC,Highway to Hell,208400,83
2,0pqnGHJpmpxLKifKRmU6WP,Believer,Imagine Dragons,Evolve,204346,87
3,5BIMPccDwShpXq784RJlJp,Enter Sandman,Metallica,Metallica,331573,73
4,57BrRMwf9LrcmuOsyGilwr,Crawling,Linkin Park,Hybrid Theory (Bonus Edition),208960,73


## Extraction of Song Analyses

In [273]:
workout_track_features = pd.DataFrame(get_track_features(workout_tracks['id'], workout_tracks))
workout_track_features.head()

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.132,https://api.spotify.com/v1/audio-analysis/2KH1...,0.817,245640,0.599,2KH16WveTQWT6KOG9Rg6e2,0.000311,0,0.0873,-9.249,0,0.0328,108.873,4,https://api.spotify.com/v1/tracks/2KH16WveTQWT...,audio_features,spotify:track:2KH16WveTQWT6KOG9Rg6e2,0.548
1,0.0591,https://api.spotify.com/v1/audio-analysis/2zYz...,0.573,208400,0.913,2zYzyRzz6pRmhPzyfMEC8s,0.00173,6,0.156,-4.793,0,0.132,115.715,4,https://api.spotify.com/v1/tracks/2zYzyRzz6pRm...,audio_features,spotify:track:2zYzyRzz6pRmhPzyfMEC8s,0.422
2,0.0622,https://api.spotify.com/v1/audio-analysis/0pqn...,0.776,204347,0.78,0pqnGHJpmpxLKifKRmU6WP,0.0,10,0.081,-4.374,0,0.128,124.949,4,https://api.spotify.com/v1/tracks/0pqnGHJpmpxL...,audio_features,spotify:track:0pqnGHJpmpxLKifKRmU6WP,0.666
3,0.00206,https://api.spotify.com/v1/audio-analysis/5BIM...,0.579,331573,0.824,5BIMPccDwShpXq784RJlJp,0.00903,6,0.059,-8.71,0,0.03,123.331,4,https://api.spotify.com/v1/tracks/5BIMPccDwShp...,audio_features,spotify:track:5BIMPccDwShpXq784RJlJp,0.635
4,0.0466,https://api.spotify.com/v1/audio-analysis/57Br...,0.58,208960,0.702,57BrRMwf9LrcmuOsyGilwr,3e-06,4,0.536,-5.565,1,0.0337,105.076,4,https://api.spotify.com/v1/tracks/57BrRMwf9Lrc...,audio_features,spotify:track:57BrRMwf9LrcmuOsyGilwr,0.299


In [274]:
# Keeping only the important audio features
workout_track_features = workout_track_features[['id', 'acousticness', 'danceability','energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence']]
display(workout_track_features.head(5))
print(workout_track_features.shape)

Unnamed: 0,id,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,2KH16WveTQWT6KOG9Rg6e2,0.132,0.817,0.599,0.000311,0,0.0873,-9.249,0,0.0328,108.873,4,0.548
1,2zYzyRzz6pRmhPzyfMEC8s,0.0591,0.573,0.913,0.00173,6,0.156,-4.793,0,0.132,115.715,4,0.422
2,0pqnGHJpmpxLKifKRmU6WP,0.0622,0.776,0.78,0.0,10,0.081,-4.374,0,0.128,124.949,4,0.666
3,5BIMPccDwShpXq784RJlJp,0.00206,0.579,0.824,0.00903,6,0.059,-8.71,0,0.03,123.331,4,0.635
4,57BrRMwf9LrcmuOsyGilwr,0.0466,0.58,0.702,3e-06,4,0.536,-5.565,1,0.0337,105.076,4,0.299


(592, 13)


## Merging Data into a Consolidated Table

In [275]:
# Merge workout tracks so that all info is consolidated 
workout_tracks = pd.merge(workout_tracks, workout_track_features, on="id", how="inner").set_index('id')
workout_tracks.head()

Unnamed: 0_level_0,name,artists,album,duration_ms,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2KH16WveTQWT6KOG9Rg6e2,Eye of the Tiger,Survivor,Rocky IV,245640,75,0.132,0.817,0.599,0.000311,0,0.0873,-9.249,0,0.0328,108.873,4,0.548
2zYzyRzz6pRmhPzyfMEC8s,Highway to Hell,AC/DC,Highway to Hell,208400,83,0.0591,0.573,0.913,0.00173,6,0.156,-4.793,0,0.132,115.715,4,0.422
0pqnGHJpmpxLKifKRmU6WP,Believer,Imagine Dragons,Evolve,204346,87,0.0622,0.776,0.78,0.0,10,0.081,-4.374,0,0.128,124.949,4,0.666
0pqnGHJpmpxLKifKRmU6WP,Believer,Imagine Dragons,Evolve,204346,87,0.0622,0.776,0.78,0.0,10,0.081,-4.374,0,0.128,124.949,4,0.666
0pqnGHJpmpxLKifKRmU6WP,Believer,Imagine Dragons,Evolve,204346,87,0.0622,0.776,0.78,0.0,10,0.081,-4.374,0,0.128,124.949,4,0.666


In [276]:
print("Number of unique tracks:", workout_tracks.name.nunique())
print("Number of tracks:", len(workout_tracks))

Number of unique tracks: 545
Number of tracks: 640


### At this point we've extracted all of our necessary information. Note that there are repeated tracks, which makes sense since popular songs tend to be present in many different playlists.

In [277]:
# Out of curiosity, going to check the duplicate songs
workout_tracks.loc[workout_tracks.duplicated(subset='name')]['name'].unique()

array(['Believer', 'Another Day', 'Put Yo Hands Up', "'Till I Collapse",
       'Earthquake', "Ain't Scared (feat. Hashu)", 'Hood Anthem',
       'Lone Wolf', 'Something Just Like This', "If I Can't Have You",
       'Shape of You', 'Happier', '24K Magic', 'Rescue Me',
       'Who Do You Love', 'Never Really Over', 'The Middle',
       'More Than You Know', 'POWER', 'Stronger', 'Numb / Encore',
       "X Gon' Give It To Ya", 'Jumpman',
       'CAN\'T STOP THE FEELING! (Original Song from DreamWorks Animation\'s "TROLLS")',
       'ME! (feat. Brendon Urie of Panic! At The Disco)', 'Thunder',
       'Sucker', 'Cake By The Ocean', 'Natural',
       "I Don't Care (with Justin Bieber)", 'You Need To Calm Down',
       'Sorry', 'Youngblood', '2002', 'Despacito - Remix', 'Attention',
       'Stitches', 'Gold Digger', 'The Next Episode', 'Power',
       'Say My Name'], dtype=object)

In [278]:
workout_tracks.drop_duplicates(inplace=True)

In [279]:
print("Number of unique tracks:", workout_tracks.name.nunique())
print("Number of tracks:", len(workout_tracks))

Number of unique tracks: 545
Number of tracks: 571


### After dropping duplicates there are still tracks with duplicate names. Before dropping them entirely, it'd be wise to investigate why they weren't dropped

In [280]:
workout_tracks[workout_tracks.duplicated(subset='name', keep=False)].sort_values(by='name')

Unnamed: 0_level_0,name,artists,album,duration_ms,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2BgEsaKNfHUdlh97KmvFyo,2002,Anne-Marie,Speak Your Mind (Deluxe),186986,83,0.0372,0.697,0.683,0.0,1,0.137,-2.881,0,0.117,96.133,4,0.603
4Pbg79cTBu4vgSphoyNq3j,2002,Anne-Marie,2002,186986,77,0.0372,0.697,0.683,0.0,1,0.137,-2.881,0,0.117,96.133,4,0.603
5cF0dROlMOK5uNZtivgu50,Attention,Charlie Puth,Voicenotes,208786,82,0.109,0.775,0.613,2.3e-05,3,0.134,-4.586,0,0.0542,100.066,4,0.797
4iLqG9SeJSnt0cSPICSjxv,Attention,Charlie Puth,Attention,211475,25,0.0969,0.774,0.626,3.1e-05,3,0.0848,-4.432,0,0.0432,100.041,4,0.777
0pqnGHJpmpxLKifKRmU6WP,Believer,Imagine Dragons,Evolve,204346,87,0.0622,0.776,0.78,0.0,10,0.081,-4.374,0,0.128,124.949,4,0.666
05KfyCEE6otdlT1pp2VIjP,Believer,Imagine Dragons,Believer,203782,15,0.0417,0.772,0.775,0.0,10,0.226,-4.388,0,0.112,124.978,4,0.748
6JV2JOEocMgcZxYSZelKcc,CAN'T STOP THE FEELING! (Original Song from Dr...,Justin Timberlake,CAN'T STOP THE FEELING! (Original Song from Dr...,236001,77,0.0123,0.667,0.83,0.0,0,0.191,-5.715,1,0.0749,113.03,4,0.701
1WkMMavIMc4JZ8cfMmxHkI,CAN'T STOP THE FEELING! (Original Song from Dr...,Justin Timberlake,TROLLS (Original Motion Picture Soundtrack),237546,74,0.0106,0.669,0.832,0.0,0,0.0968,-5.72,1,0.0682,113.035,4,0.7
42ftjU4cTN5UTRksyqBKZJ,Cake By The Ocean,DNCE,SWAAY,218440,11,0.147,0.771,0.76,0.0,4,0.053,-5.459,0,0.0514,119.0,4,0.881
2Aa1wE8ofs2tu59TOQrZKW,Cake By The Ocean,DNCE,SWAAY,218440,14,0.156,0.774,0.765,0.0,4,0.0523,-5.503,0,0.0517,119.018,4,0.908


### Reasons for Duplicates:
1. Same Song Different Artist
2. Same Song Same Artist Different Album
3. Same Everything, but Different Analysis  

#### Handle the second case and third case by dropping the less popular version.

In [298]:
workout_tracks = workout_tracks.sort_values(by=['name', 'popularity'], ascending = False)
workout_tracks.drop_duplicates(subset=['name', 'artists'], inplace=True)

In [308]:
# If this passes I can safely assume all duplicates have been removed
assert not any(workout_tracks.duplicated(subset=['name', 'artists'], keep=False))

### All duplicate records have been removed. Now we need to check data types and null values.